From e11750d54b343701c4df4579bb6590c85b99ac61 Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Sat, 30 Nov 2024 00:35:18 +0100
Subject: [PATCH] Basic implementation of the task

---
 pyproject.toml        |   2 +-
 requirements-dev.lock |  10 +--
 requirements.lock     |  10 +--
 src/__main__.py       | 196 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+), 11 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c67986f..0e7d01c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,13 +6,13 @@ authors = [{ name = "Peter Vacho", email = "p_vacho@utb.cz" }]
 dependencies = [
     "numpy>=2.1.1",
     "matplotlib>=3.9.2",
-    "pyqt5>=5.15.11",
     "scikit-learn>=1.5.2",
     "polars[all]>=1.9.0",
     "seaborn>=0.13.2",
     "rich>=13.9.2",
     "httpx>=0.27.2",
     "beautifulsoup4>=4.12.3",
+    "pyqt6>=6.7.1",
 ]
 readme = "README.md"
 requires-python = ">= 3.12"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 1959d1e..361a5f4 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -191,12 +191,12 @@ pyiceberg==0.7.1
 pyparsing==3.1.4
     # via matplotlib
     # via pyiceberg
-pyqt5==5.15.11
+pyqt6==6.7.1
     # via task5
-pyqt5-qt5==5.15.15
-    # via pyqt5
-pyqt5-sip==12.15.0
-    # via pyqt5
+pyqt6-qt6==6.7.3
+    # via pyqt6
+pyqt6-sip==13.8.0
+    # via pyqt6
 python-dateutil==2.9.0.post0
     # via matplotlib
     # via pandas
diff --git a/requirements.lock b/requirements.lock
index 5b9c8e7..ad14287 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -151,12 +151,12 @@ pyiceberg==0.7.1
 pyparsing==3.1.4
     # via matplotlib
     # via pyiceberg
-pyqt5==5.15.11
+pyqt6==6.7.1
     # via task5
-pyqt5-qt5==5.15.15
-    # via pyqt5
-pyqt5-sip==12.15.0
-    # via pyqt5
+pyqt6-qt6==6.7.3
+    # via pyqt6
+pyqt6-sip==13.8.0
+    # via pyqt6
 python-dateutil==2.9.0.post0
     # via matplotlib
     # via pandas
diff --git a/src/__main__.py b/src/__main__.py
index 23fb5a3..ec84bf7 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,5 +1,201 @@
+import os
+from collections.abc import Iterator
+from contextlib import contextmanager
+from dataclasses import dataclass
+from time import perf_counter
+from typing import cast, final
+
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+from pandas import DataFrame, Series
+from sklearn.datasets import fetch_openml
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.utils import Bunch
+
+# pyqt6 only bundles Windows & Fusion styles, which means that if you use a
+# different preferred qt style, a warning would be produced. This gets rid
+# of that warning by removing the env-var override. The variable is usually
+# not set at all, hence the default passed to pop (avoids a KeyError).
+if os.environ.get("QT_STYLE_OVERRIDE") not in {"Windows", "Fusion"}:
+    os.environ.pop("QT_STYLE_OVERRIDE", None)
+
+
+@final
+@dataclass
+class MLDataset:
+    """This is a structure that holds a dataset for Machine Learning.
+
+    The dataset is split into a training portion and a testing portion.
+    For both portions, this structure stores the features (x) and target values (y).
+
+    The features are a data frame for the loaded data and a numpy array once
+    the dimensionality got reduced.
+    """
+
+    x_train: DataFrame | np.ndarray
+    y_train: "Series[str]"
+    x_test: DataFrame | np.ndarray
+    y_test: "Series[str]"
+
+    def downsize(self, sample_size: int) -> "MLDataset":
+        """Create a downsized dataset with given sample size.
+
+        Both portions are cut down to their first `sample_size` samples. A portion
+        that is smaller than the sample size is kept whole.
+
+        Raises:
+            ValueError: If the sample size is negative.
+        """
+        if sample_size < 0:
+            raise ValueError(f"Sample size can't be negative (got {sample_size})")
+        return MLDataset(
+            x_train=self.x_train[:sample_size],
+            y_train=self.y_train[:sample_size],
+            x_test=self.x_test[:sample_size],
+            y_test=self.y_test[:sample_size],
+        )
+
+
+def load_data() -> MLDataset:
+    """Load the MNIST dataset."""
+    mnist = cast(Bunch, fetch_openml("mnist_784", version=1))
+    # While MNIST does generally have a predefined testing and training portions,
+    # sklearn stores it as a single dataset, so we'll do our own splitting.
+    x_train, x_test, y_train, y_test = train_test_split(
+        cast(DataFrame, mnist.data),
+        cast("Series[str]", mnist.target),
+        test_size=0.2,
+        random_state=12,
+    )
+
+    # Scikit functions have terrible typing definitions...
+    return MLDataset(
+        x_train=x_train,  # type: ignore[reportArgumentType]
+        y_train=y_train,  # type: ignore[reportArgumentType]
+        x_test=x_test,  # type: ignore[reportArgumentType]
+        y_test=y_test,  # type: ignore[reportArgumentType]
+    )
+
+
+def dim_reduce_pca(data: MLDataset, dimensions: int = 2) -> MLDataset:
+    """Reduce dimensionality using the PCA method.
+
+    The projection is fitted on the training portion only and then applied to
+    the testing portion, so that both portions end up in the same space.
+    """
+    pca = PCA(n_components=dimensions, random_state=12)
+    x_train_pca = pca.fit_transform(data.x_train)
+    x_test_pca = pca.transform(data.x_test)
+    return MLDataset(x_train=x_train_pca, y_train=data.y_train, x_test=x_test_pca, y_test=data.y_test)
+
+
+def dim_reduce_tsne(data: MLDataset, dimensions: int = 2, perplexity: int = 30, max_iter: int = 500) -> MLDataset:
+    """Reduce dimensionality using the TSNE method.
+
+    TSNE can't project new samples into an already fitted embedding (it has no
+    transform method), so both portions are embedded together in a single run
+    and split back afterwards. Embedding them separately would produce two
+    unrelated spaces, making the testing portion useless for evaluation.
+    """
+    tsne = TSNE(
+        n_components=dimensions,
+        random_state=12,
+        perplexity=perplexity,
+        max_iter=max_iter,
+        n_jobs=-1,  # Run in parallel, using all available processors
+    )
+    train_size = len(data.x_train)
+    embedded = tsne.fit_transform(np.vstack([data.x_train, data.x_test]))
+    return MLDataset(
+        x_train=embedded[:train_size],
+        y_train=data.y_train,
+        x_test=embedded[train_size:],
+        y_test=data.y_test,
+    )
+
+
+def knn_accuracy(data: MLDataset, k_range: range | None = None) -> tuple[int, float]:
+    """Run KNN for various k values, searching for one giving the best accuracy.
+
+    The best k is picked with a 5-fold cross-validation on the training portion,
+    the returned accuracy is then measured on the testing portion. If no range
+    is given, k values from 1 to 10 are tried.
+    """
+    if k_range is None:
+        k_range = range(1, 11)
+    param_grid = {"n_neighbors": k_range}
+    knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
+    knn.fit(data.x_train, data.y_train)
+    best_k = int(knn.best_params_["n_neighbors"])
+    y_pred = knn.predict(data.x_test)
+    acc = float(accuracy_score(data.y_test, y_pred))
+    return best_k, acc
+
+
+def plot_2d(x: DataFrame | np.ndarray, y: "Series[str]", title: str) -> None:
+    """Show a 2D visualization of the given 2D (dim-reduced) dataset."""
+    # Data frames don't support the positional indexing used below, convert first
+    points = np.asarray(x)
+    plt.figure(figsize=(8, 6))
+    sns.scatterplot(x=points[:, 0], y=points[:, 1], hue=y, palette="tab10", legend="full", s=15)
+    plt.title(title)
+    plt.show()
+
+
+@contextmanager
+def timed(start_msg: str, end_msg: str) -> Iterator[None]:
+    """Context manager timing the progress of its body.
+
+    The start message is skipped when empty. If the body raises, the end
+    message is not printed.
+    """
+    if start_msg:
+        print(start_msg + "...")
+    start = perf_counter()
+    yield
+    took = perf_counter() - start
+    print(end_msg + f" (took: {round(took, 2)}s)")
+
+
 def main() -> None:
     """Program entrypoint."""
+    with timed("Loading the MNIST dataset", "MNIST dataset loaded"):
+        # Working with the entire dataset would be way too computationally expensive
+        # (TSNE would take hours, if not more), instead, downsize the dataset and work
+        # with a smaller sample
+        mnist = load_data().downsize(8000)
+
+    print()
+
+    with timed("Reducing dimensionality using PCA", "PCA Finished"):
+        mnist_pca = dim_reduce_pca(mnist)
+
+    with timed("Reducing dimensionality using t-SNE", "t-SNE Finished"):
+        mnist_tsne = dim_reduce_tsne(mnist)
+
+    print()
+
+    with timed("Measuring KNN accuracies", "Finished"):
+        k_original, acc_original = knn_accuracy(mnist)
+        k_pca, acc_pca = knn_accuracy(mnist_pca)
+        k_tsne, acc_tsne = knn_accuracy(mnist_tsne)
+
+    print()
+
+    print(f"Original data: Best k={k_original}, Accuracy={acc_original:.2f}")
+    print(f"PCA reduced data: Best k={k_pca}, Accuracy={acc_pca:.2f}")
+    print(f"t-SNE reduced data: Best k={k_tsne}, Accuracy={acc_tsne:.2f}")
+
+    print()
+
+    with timed("Showing graphs", "Finished"):
+        plot_2d(mnist_pca.x_train, mnist_pca.y_train, "2D PCA of MNIST")
+        plot_2d(mnist_tsne.x_train, mnist_tsne.y_train, "2D t-SNE of MNIST")
 
 
 if __name__ == "__main__":