Basic implementation of the task

Peter Vacho 2024-11-30 00:35:18 +01:00
parent b269c2223a
commit e11750d54b
Signed by: school
GPG key ID: 8CFC3837052871B4
4 changed files with 171 additions and 11 deletions

View file

@@ -6,13 +6,13 @@ authors = [{ name = "Peter Vacho", email = "p_vacho@utb.cz" }]
 dependencies = [
     "numpy>=2.1.1",
     "matplotlib>=3.9.2",
-    "pyqt5>=5.15.11",
     "scikit-learn>=1.5.2",
     "polars[all]>=1.9.0",
     "seaborn>=0.13.2",
     "rich>=13.9.2",
     "httpx>=0.27.2",
     "beautifulsoup4>=4.12.3",
+    "pyqt6>=6.7.1",
 ]
 readme = "README.md"
 requires-python = ">= 3.12"
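
Swapping pyqt5 for pyqt6 should need no plotting-code changes on matplotlib >= 3.5: the generic QtAgg backend auto-detects whichever supported Qt binding is installed (PyQt6, PySide6, PyQt5, ...). A minimal sketch to verify the new binding gets picked up (not part of this commit):

import matplotlib

matplotlib.use("QtAgg")  # generic Qt backend; resolves to whichever binding is installed

import matplotlib.pyplot as plt  # the binding is imported here; fails if none is present

plt.plot([0, 1], [0, 1])
plt.show()  # opens a Qt window, confirming the PyQt6 backend end to end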

View file

@@ -191,12 +191,12 @@ pyiceberg==0.7.1
 pyparsing==3.1.4
     # via matplotlib
     # via pyiceberg
-pyqt5==5.15.11
+pyqt6==6.7.1
     # via task5
-pyqt5-qt5==5.15.15
-    # via pyqt5
-pyqt5-sip==12.15.0
-    # via pyqt5
+pyqt6-qt6==6.7.3
+    # via pyqt6
+pyqt6-sip==13.8.0
+    # via pyqt6
 python-dateutil==2.9.0.post0
     # via matplotlib
     # via pandas

View file

@@ -151,12 +151,12 @@ pyiceberg==0.7.1
 pyparsing==3.1.4
     # via matplotlib
     # via pyiceberg
-pyqt5==5.15.11
+pyqt6==6.7.1
     # via task5
-pyqt5-qt5==5.15.15
-    # via pyqt5
-pyqt5-sip==12.15.0
-    # via pyqt5
+pyqt6-qt6==6.7.3
+    # via pyqt6
+pyqt6-sip==13.8.0
+    # via pyqt6
 python-dateutil==2.9.0.post0
     # via matplotlib
     # via pandas

View file

@@ -1,5 +1,165 @@
import os
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from time import perf_counter
from typing import cast, final

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import Bunch

# pyqt6 only bundles the Windows & Fusion styles, so if the environment
# requests any other Qt style, a warning would be produced. Dropping the
# unsupported override gets rid of that warning.
if os.environ.get("QT_STYLE_OVERRIDE") not in {"Windows", "Fusion"}:
    os.environ.pop("QT_STYLE_OVERRIDE", None)  # default avoids KeyError when unset

@final
@dataclass
class MLDataset:
    """Structure that holds a dataset for Machine Learning.

    The dataset is split into a training portion and a testing portion.
    For both portions, this structure stores the features (x) and target values (y).
    """

    # After dimensionality reduction, the feature matrices are numpy arrays
    # rather than DataFrames; both support the slicing used in downsize().
    x_train: "DataFrame | np.ndarray"
    y_train: "Series[str]"
    x_test: "DataFrame | np.ndarray"
    y_test: "Series[str]"

    def downsize(self, sample_size: int) -> "MLDataset":
        """Create a downsized dataset with the given sample size.

        If the sample size exceeds the size of the dataset, this returns
        a new instance containing the full dataset.
        """
        return MLDataset(
            x_train=self.x_train[:sample_size],
            y_train=self.y_train[:sample_size],
            x_test=self.x_test[:sample_size],
            y_test=self.y_test[:sample_size],
        )
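
# Illustrative usage of the structure above (values are hypothetical, not
# part of this module):
#
#     data = load_data()
#     small = data.downsize(1000)   # keep the first 1000 rows of each part
#     same = small.downsize(10**9)  # oversized request: a copy of `small`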

def load_data() -> MLDataset:
    """Load the MNIST dataset."""
    mnist = cast(Bunch, fetch_openml("mnist_784", version=1))

    # While MNIST does have a conventional train/test split, sklearn exposes
    # it as a single dataset, so we do our own splitting.
    x_train, x_test, y_train, y_test = train_test_split(
        cast(DataFrame, mnist.data),
        cast("Series[str]", mnist.target),
        test_size=0.2,
        random_state=12,
    )

    # Scikit functions have terrible typing definitions...
    return MLDataset(
        x_train=x_train,  # type: ignore[reportArgumentType]
        y_train=y_train,  # type: ignore[reportArgumentType]
        x_test=x_test,  # type: ignore[reportArgumentType]
        y_test=y_test,  # type: ignore[reportArgumentType]
    )
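
# Note: fetch_openml() caches the downloaded dataset on disk (by default
# under ~/scikit_learn_data), so only the first run pays the network cost.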

def dim_reduce_pca(data: MLDataset, dimensions: int = 2) -> MLDataset:
    """Reduce dimensionality using the PCA method."""
    pca = PCA(n_components=dimensions, random_state=12)
    x_train_pca = pca.fit_transform(data.x_train)
    # Project the test set with the already-fitted PCA; refitting on the test
    # data would produce a different, incompatible projection.
    x_test_pca = pca.transform(data.x_test)
    return MLDataset(x_train=x_train_pca, y_train=data.y_train, x_test=x_test_pca, y_test=data.y_test)
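
# With n_components=2, PCA keeps only a small fraction of MNIST's total
# variance; pca.explained_variance_ratio_.sum() reports exactly how much,
# which is worth checking when interpreting the accuracy numbers below.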

def dim_reduce_tsne(data: MLDataset, dimensions: int = 2, perplexity: int = 30, max_iter: int = 500) -> MLDataset:
    """Reduce dimensionality using the t-SNE method."""
    tsne = TSNE(
        n_components=dimensions,
        random_state=12,
        perplexity=perplexity,
        max_iter=max_iter,
        n_jobs=-1,  # Run in parallel, using all available processors
    )
    x_train_tsne = tsne.fit_transform(data.x_train)
    # sklearn's TSNE has no transform() for new data, so the test set has to
    # be embedded separately; its embedding isn't aligned with the train one.
    x_test_tsne = tsne.fit_transform(data.x_test)
    return MLDataset(x_train=x_train_tsne, y_train=data.y_train, x_test=x_test_tsne, y_test=data.y_test)
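
# Tuning note: perplexity roughly sets the effective neighborhood size t-SNE
# tries to preserve (commonly 5-50), and max_iter trades runtime for a more
# settled embedding. The defaults above (30 and 500) are a conventional
# middle ground, not values prescribed by the assignment.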

def knn_accuracy(data: MLDataset, k_range: range | None = None) -> tuple[int, float]:
    """Run KNN for various k values, searching for the one giving the best accuracy."""
    if k_range is None:
        k_range = range(1, 11)

    param_grid = {"n_neighbors": list(k_range)}
    knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
    knn.fit(data.x_train, data.y_train)

    best_k = knn.best_params_["n_neighbors"]
    y_pred = knn.predict(data.x_test)
    acc = accuracy_score(data.y_test, y_pred)
    return best_k, acc
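
# GridSearchCV with cv=5 cross-validates every k on the training split only,
# then (with the default refit=True) refits the best model on the full
# training set; that refitted model is what predict() uses above. A narrower
# or wider search is a one-argument change, e.g.:
#
#     best_k, acc = knn_accuracy(dataset, k_range=range(1, 21))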

def plot_2d(x: np.ndarray, y: "Series[str]", title: str) -> None:
    """Show a 2D visualization of the given 2D (dim-reduced) dataset."""
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, palette="tab10", legend="full", s=15)
    plt.title(title)
    plt.show()

@contextmanager
def timed(start_msg: str, end_msg: str) -> Iterator[None]:
    """Context manager timing the execution of its body."""
    if start_msg != "":
        print(start_msg + "...")
    start = perf_counter()
    yield
    took = perf_counter() - start
    print(end_msg + f" (took: {round(took, 2)}s)")
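
# Example usage (hypothetical body):
#
#     with timed("Crunching numbers", "Done"):
#         run_expensive_computation()
#
# Passing an empty start_msg skips the opening line but still reports timing.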

def main() -> None:
    """Program entrypoint."""
    with timed("Loading the MNIST dataset", "MNIST dataset loaded"):
        # Working with the entire dataset would be far too computationally
        # expensive (t-SNE alone would take hours, if not more), so we
        # downsize the dataset and work with a smaller sample instead.
        mnist = load_data().downsize(8000)
    print()

    with timed("Reducing dimensionality using PCA", "PCA Finished"):
        mnist_pca = dim_reduce_pca(mnist)
    with timed("Reducing dimensionality using t-SNE", "t-SNE Finished"):
        mnist_tsne = dim_reduce_tsne(mnist)
    print()

    with timed("Measuring KNN accuracies", "Finished"):
        k_original, acc_original = knn_accuracy(mnist)
        k_pca, acc_pca = knn_accuracy(mnist_pca)
        k_tsne, acc_tsne = knn_accuracy(mnist_tsne)
    print()

    print(f"Original data: Best k={k_original}, Accuracy={acc_original:.2f}")
    print(f"PCA reduced data: Best k={k_pca}, Accuracy={acc_pca:.2f}")
    print(f"t-SNE reduced data: Best k={k_tsne}, Accuracy={acc_tsne:.2f}")
    print()

    with timed("Showing graphs", "Finished"):
        plot_2d(mnist_pca.x_train, mnist_pca.y_train, "2D PCA of MNIST")
        plot_2d(mnist_tsne.x_train, mnist_tsne.y_train, "2D t-SNE of MNIST")


if __name__ == "__main__":
    main()