Basic implementation of the task

2024-11-30 00:35:18 +01:00 · 2024-11-30 00:35:18 +01:00 · e11750d54b
parent b269c2223a
commit e11750d54b
4 changed files with 171 additions and 11 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,13 +6,13 @@ authors = [{ name = "Peter Vacho", email = "p_vacho@utb.cz" }]
 dependencies = [
    "numpy>=2.1.1",
    "matplotlib>=3.9.2",
-    "pyqt5>=5.15.11",
    "scikit-learn>=1.5.2",
    "polars[all]>=1.9.0",
    "seaborn>=0.13.2",
    "rich>=13.9.2",
    "httpx>=0.27.2",
    "beautifulsoup4>=4.12.3",
+    "pyqt6>=6.7.1",
 ]
 readme = "README.md"
 requires-python = ">= 3.12"
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@ -191,12 +191,12 @@ pyiceberg==0.7.1
 pyparsing==3.1.4
    # via matplotlib
    # via pyiceberg
-pyqt5==5.15.11
+pyqt6==6.7.1
    # via task5
-pyqt5-qt5==5.15.15
-    # via pyqt5
-pyqt5-sip==12.15.0
-    # via pyqt5
+pyqt6-qt6==6.7.3
+    # via pyqt6
+pyqt6-sip==13.8.0
+    # via pyqt6
 python-dateutil==2.9.0.post0
    # via matplotlib
    # via pandas
--- a/requirements.lock
+++ b/requirements.lock
@ -151,12 +151,12 @@ pyiceberg==0.7.1
 pyparsing==3.1.4
    # via matplotlib
    # via pyiceberg
-pyqt5==5.15.11
+pyqt6==6.7.1
    # via task5
-pyqt5-qt5==5.15.15
-    # via pyqt5
-pyqt5-sip==12.15.0
-    # via pyqt5
+pyqt6-qt6==6.7.3
+    # via pyqt6
+pyqt6-sip==13.8.0
+    # via pyqt6
 python-dateutil==2.9.0.post0
    # via matplotlib
    # via pandas
--- a/src/main.py
+++ b/src/main.py
@ -1,5 +1,165 @@
+import os
+from collections.abc import Iterator
+from contextlib import contextmanager
+from dataclasses import dataclass
+from time import perf_counter
+from typing import cast, final
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pandas import DataFrame, Series
+from sklearn.datasets import fetch_openml
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.utils import Bunch
+
+# pyqt6 only bundles the Windows & Fusion styles, so a QT_STYLE_OVERRIDE naming
+# any other style makes Qt emit a warning. Drop the override in that case.
+# The pop is given a default because the variable is frequently not set at all
+# (get() then returns None, which is not in the set), and a bare pop() would
+# raise KeyError at import time.
+if os.environ.get("QT_STYLE_OVERRIDE") not in {"Windows", "Fusion"}:
+    os.environ.pop("QT_STYLE_OVERRIDE", None)
+
+
+@final
+@dataclass
+class MLDataset:
+    """Train/test split of a dataset used for Machine Learning.
+
+    The features (x) and the target values (y) are kept separately for the
+    training portion and for the testing portion.
+    """
+
+    x_train: DataFrame
+    y_train: "Series[str]"
+    x_test: DataFrame
+    y_test: "Series[str]"
+
+    def downsize(self, sample_size: int) -> "MLDataset":
+        """Return a new dataset holding only the leading ``sample_size`` entries.
+
+        The same limit is applied independently to each of the four members.
+        A limit larger than a member simply leaves that member at full length,
+        so the result is then a new instance with the same contents.
+        """
+        head = slice(sample_size)
+        return MLDataset(
+            self.x_train[head],
+            self.y_train[head],
+            self.x_test[head],
+            self.y_test[head],
+        )
+
+
+def load_data() -> MLDataset:
+    """Fetch the MNIST dataset and split it into training and testing portions.
+
+    20% of the samples are held out for testing; the split uses a fixed
+    random state so it is reproducible between runs.
+    """
+    bunch = cast(Bunch, fetch_openml("mnist_784", version=1))
+    features = cast(DataFrame, bunch.data)
+    labels = cast("Series[str]", bunch.target)
+
+    # MNIST usually ships with predefined training/testing portions, but sklearn
+    # hands it over as one dataset, hence the manual split here.
+    split = train_test_split(features, labels, test_size=0.2, random_state=12)
+    x_train, x_test, y_train, y_test = split
+
+    # The scikit-learn typing definitions are too loose for the type checker.
+    return MLDataset(
+        x_train=x_train,  # type: ignore[reportArgumentType]
+        y_train=y_train,  # type: ignore[reportArgumentType]
+        x_test=x_test,  # type: ignore[reportArgumentType]
+        y_test=y_test,  # type: ignore[reportArgumentType]
+    )
+
+
+def dim_reduce_pca(data: MLDataset, dimensions: int = 2) -> MLDataset:
+    """Reduce dimensionality using the PCA method.
+
+    The projection is fitted on the training features only and then applied
+    unchanged to the testing features, so both portions end up in the same
+    component space.
+
+    Args:
+        data: Dataset whose features should be reduced.
+        dimensions: Number of principal components to keep.
+
+    Returns:
+        A new dataset with the reduced features and the original target values.
+    """
+    pca = PCA(n_components=dimensions, random_state=12)
+    x_train_pca = pca.fit_transform(data.x_train)
+    # Only transform here: re-fitting on the test portion would derive a second,
+    # unrelated set of components and make train/test coordinates incomparable.
+    x_test_pca = pca.transform(data.x_test)
+    return MLDataset(x_train=x_train_pca, y_train=data.y_train, x_test=x_test_pca, y_test=data.y_test)
+
+
+def dim_reduce_tsne(data: MLDataset, dimensions: int = 2, perplexity: int = 30, max_iter: int = 500) -> MLDataset:
+    """Reduce dimensionality using the TSNE method.
+
+    t-SNE cannot project new samples into an already computed embedding, so
+    the training and testing features are embedded together in a single run
+    and split back apart afterwards. Two separate runs would produce two
+    unrelated coordinate systems, and a classifier trained on one could not be
+    meaningfully evaluated on the other.
+
+    Args:
+        data: Dataset whose features should be reduced.
+        dimensions: Number of dimensions of the embedding.
+        perplexity: Perplexity passed to TSNE.
+        max_iter: Maximum number of optimisation iterations passed to TSNE.
+
+    Returns:
+        A new dataset with the embedded features and the original target values.
+    """
+    # Local import: numpy is not among the module-level imports of this file.
+    import numpy as np
+
+    tsne = TSNE(
+        n_components=dimensions,
+        random_state=12,
+        perplexity=perplexity,
+        max_iter=max_iter,
+        n_jobs=-1,  # Run in parallel, using all available processors
+    )
+    train_count = len(data.x_train)
+    combined = np.concatenate([np.asarray(data.x_train), np.asarray(data.x_test)])
+    embedded = tsne.fit_transform(combined)
+    return MLDataset(
+        x_train=embedded[:train_count],
+        y_train=data.y_train,
+        x_test=embedded[train_count:],
+        y_test=data.y_test,
+    )
+
+
+def knn_accuracy(data: MLDataset, k_range: range | None = None) -> tuple[object, float]:
+    """Grid-search the KNN neighbour count and measure accuracy on the test portion.
+
+    Candidate values come from ``k_range`` (1 through 10 when omitted) and are
+    compared using 5-fold cross-validation on the training portion.
+
+    Returns a ``(best k, accuracy)`` pair, where the accuracy is computed from
+    the predictions made for the testing portion.
+    """
+    candidates = range(1, 11) if k_range is None else k_range
+    search = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": candidates}, cv=5)
+    search.fit(data.x_train, data.y_train)
+    predictions = search.predict(data.x_test)
+    return search.best_params_["n_neighbors"], accuracy_score(data.y_test, predictions)
+
+
+def plot_2d(x: DataFrame, y: "Series[str]", title: str) -> None:
+    """Show a 2D visualization of the given 2D (dim-reduced) dataset.
+
+    Args:
+        x: Reduced features; the first two columns are used as plot coordinates.
+            Both a DataFrame and a plain array are accepted.
+        y: Target values, used to colour the points.
+        title: Title shown above the plot.
+    """
+    # Local import: numpy is not among the module-level imports of this file.
+    import numpy as np
+
+    # Positional ``[:, i]`` indexing only works on arrays, while the signature
+    # advertises a DataFrame; normalising first makes both inputs valid.
+    points = np.asarray(x)
+    plt.figure(figsize=(8, 6))
+    sns.scatterplot(x=points[:, 0], y=points[:, 1], hue=y, palette="tab10", legend="full", s=15)
+    plt.title(title)
+    plt.show()
+
+
+@contextmanager
+def timed(start_msg: str, end_msg: str) -> Iterator[None]:
+    """Context manager that times the execution of its body.
+
+    ``start_msg`` (followed by an ellipsis) is printed on entry unless it is
+    empty. When the body finishes, ``end_msg`` is printed together with the
+    elapsed time in seconds, rounded to two decimal places.
+    """
+    if start_msg:
+        print(f"{start_msg}...")
+    began = perf_counter()
+    yield
+    elapsed = round(perf_counter() - began, 2)
+    print(f"{end_msg} (took: {elapsed}s)")
+
+
 def main() -> None:
    """Program entrypoint."""
+    # Pipeline: load a downsized MNIST sample, reduce it to 2D with PCA and
+    # t-SNE, compare the KNN accuracy on all three variants, then plot the
+    # two reduced training sets.
+    with timed("Loading the MNIST dataset", "MNIST dataset loaded"):
+        # Working with the entire dataset would be way too computationally expensive
+        # (TSNE would take hours, if not more), instead, downsize the dataset and work
+        # with a smaller sample
+        # Note that downsize() applies the limit to the training and the testing
+        # portion separately, so each of them is capped at 8000 samples.
+        mnist = load_data().downsize(8000)
+
+    print()
+
+    with timed("Reducing dimensionality using PCA", "PCA Finished"):
+        mnist_pca = dim_reduce_pca(mnist)
+
+    with timed("Reducing dimensionality using t-SNE", "t-SNE Finished"):
+        mnist_tsne = dim_reduce_tsne(mnist)
+
+    print()
+
+    # All three searches are timed together; only the total is reported.
+    with timed("Measuring KNN accuracies", "Finished"):
+        k_original, acc_original = knn_accuracy(mnist)
+        k_pca, acc_pca = knn_accuracy(mnist_pca)
+        k_tsne, acc_tsne = knn_accuracy(mnist_tsne)
+
+    print()
+
+    print(f"Original data: Best k={k_original}, Accuracy={acc_original:.2f}")
+    print(f"PCA reduced data: Best k={k_pca}, Accuracy={acc_pca:.2f}")
+    print(f"t-SNE reduced data: Best k={k_tsne}, Accuracy={acc_tsne:.2f}")
+
+    print()
+
+    # Only the training portions are visualized. Each plot_2d call ends with
+    # plt.show(), so the time reported here presumably includes how long the
+    # plot windows stay open (assuming a blocking backend; verify).
+    with timed("Showing graphs", "Finished"):
+        plot_2d(mnist_pca.x_train, mnist_pca.y_train, "2D PCA of MNIST")
+        plot_2d(mnist_tsne.x_train, mnist_tsne.y_train, "2D t-SNE of MNIST")


 if __name__ == "__main__":