Basic implementation of the task
parent b269c2223a
commit e11750d54b
@@ -6,13 +6,13 @@ authors = [{ name = "Peter Vacho", email = "p_vacho@utb.cz" }]
dependencies = [
    "numpy>=2.1.1",
    "matplotlib>=3.9.2",
    "pyqt5>=5.15.11",
    "scikit-learn>=1.5.2",
    "polars[all]>=1.9.0",
    "seaborn>=0.13.2",
    "rich>=13.9.2",
    "httpx>=0.27.2",
    "beautifulsoup4>=4.12.3",
    "pyqt6>=6.7.1",
]
readme = "README.md"
requires-python = ">= 3.12"
@@ -191,12 +191,12 @@ pyiceberg==0.7.1
pyparsing==3.1.4
    # via matplotlib
    # via pyiceberg
pyqt5==5.15.11
pyqt6==6.7.1
    # via task5
pyqt5-qt5==5.15.15
    # via pyqt5
pyqt5-sip==12.15.0
    # via pyqt5
pyqt6-qt6==6.7.3
    # via pyqt6
pyqt6-sip==13.8.0
    # via pyqt6
python-dateutil==2.9.0.post0
    # via matplotlib
    # via pandas
@@ -151,12 +151,12 @@ pyiceberg==0.7.1
pyparsing==3.1.4
    # via matplotlib
    # via pyiceberg
pyqt5==5.15.11
pyqt6==6.7.1
    # via task5
pyqt5-qt5==5.15.15
    # via pyqt5
pyqt5-sip==12.15.0
    # via pyqt5
pyqt6-qt6==6.7.3
    # via pyqt6
pyqt6-sip==13.8.0
    # via pyqt6
python-dateutil==2.9.0.post0
    # via matplotlib
    # via pandas
src/__main__.py (+160 lines)
@@ -1,5 +1,165 @@
import os
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from time import perf_counter
from typing import cast, final

import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import Bunch

# PyQt6 only bundles the Windows & Fusion styles, so a different preferred Qt
# style would produce a warning. Removing the env-var override gets rid of
# that warning.
if os.environ.get("QT_STYLE_OVERRIDE") not in {"Windows", "Fusion"}:
    # The default argument avoids a KeyError when the variable isn't set at all.
    os.environ.pop("QT_STYLE_OVERRIDE", None)


@final
@dataclass
class MLDataset:
    """This is a structure that holds a dataset for Machine Learning.

    The dataset is split into a training portion and a testing portion.
    For both portions, this structure stores the features (x) and target values (y).
    """

    x_train: DataFrame
    y_train: "Series[str]"
    x_test: DataFrame
    y_test: "Series[str]"

    def downsize(self, sample_size: int) -> "MLDataset":
        """Create a downsized dataset with given sample size.

        If the sample size is higher than the size of the dataset, this will return
        a new instance of the same dataset.
        """
        return MLDataset(
            x_train=self.x_train[:sample_size],
            y_train=self.y_train[:sample_size],
            x_test=self.x_test[:sample_size],
            y_test=self.y_test[:sample_size],
        )
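
# Illustrative: downsize() just slices the leading rows, so e.g.
# load_data().downsize(100) keeps at most the first 100 training and
# 100 testing samples.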


def load_data() -> MLDataset:
    """Load the MNIST dataset."""
    mnist = cast(Bunch, fetch_openml("mnist_784", version=1))
    # While MNIST does generally have predefined testing and training portions,
    # sklearn stores it as a single dataset, so we'll do our own splitting.
    x_train, x_test, y_train, y_test = train_test_split(
        cast(DataFrame, mnist.data),
        cast("Series[str]", mnist.target),
        test_size=0.2,
        random_state=12,
    )

    # Scikit functions have terrible typing definitions...
    return MLDataset(
        x_train=x_train,  # type: ignore[reportArgumentType]
        y_train=y_train,  # type: ignore[reportArgumentType]
        x_test=x_test,  # type: ignore[reportArgumentType]
        y_test=y_test,  # type: ignore[reportArgumentType]
    )
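
# Note: fetch_openml caches downloads (by default under ~/scikit_learn_data);
# mnist_784 has 70,000 samples with 784 features (28x28 pixel images).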


def dim_reduce_pca(data: MLDataset, dimensions: int = 2) -> MLDataset:
    """Reduce dimensionality using the PCA method."""
    pca = PCA(n_components=dimensions, random_state=12)
    x_train_pca = pca.fit_transform(data.x_train)
    # Project the test set with the components fitted on the training set;
    # refitting on the test data would yield an incompatible basis.
    x_test_pca = pca.transform(data.x_test)
    return MLDataset(x_train=x_train_pca, y_train=data.y_train, x_test=x_test_pca, y_test=data.y_test)
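
# Illustrative: after fitting, the retained signal can be inspected with
# pca.explained_variance_ratio_.sum() (two components typically keep only a
# small fraction of MNIST's variance).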


def dim_reduce_tsne(data: MLDataset, dimensions: int = 2, perplexity: int = 30, max_iter: int = 500) -> MLDataset:
    """Reduce dimensionality using the t-SNE method."""
    tsne = TSNE(
        n_components=dimensions,
        random_state=12,
        perplexity=perplexity,
        max_iter=max_iter,
        n_jobs=-1,  # Run in parallel, using all available processors
    )
    # t-SNE has no transform() for unseen data, so both portions have to be
    # embedded independently with fit_transform.
    x_train_tsne = tsne.fit_transform(data.x_train)
    x_test_tsne = tsne.fit_transform(data.x_test)
    return MLDataset(x_train=x_train_tsne, y_train=data.y_train, x_test=x_test_tsne, y_test=data.y_test)
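
# Caveat: the two t-SNE embeddings above don't share a coordinate system, so
# KNN results on the independently embedded test set should be read as a rough
# indication rather than a strict evaluation.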


def knn_accuracy(data: MLDataset, k_range: range | None = None) -> tuple[object, float]:
    """Run KNN for various k values, searching for one giving the best accuracy."""
    if k_range is None:
        k_range = range(1, 11)
    param_grid = {"n_neighbors": k_range}
    knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
    knn.fit(data.x_train, data.y_train)
    best_k = knn.best_params_["n_neighbors"]
    y_pred = knn.predict(data.x_test)
    acc = accuracy_score(data.y_test, y_pred)
    return best_k, acc
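
# Note: GridSearchCV refits the best estimator on the whole training set by
# default (refit=True), so predict() above already uses the best k found by
# cross-validation.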


def plot_2d(x: DataFrame, y: "Series[str]", title: str) -> None:
    """Show a 2D visualization of the given 2D (dim-reduced) dataset."""
    plt.figure(figsize=(8, 6))
    # In practice x is the 2-column ndarray produced by the reducers above,
    # which is what makes the positional [:, n] indexing work.
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, palette="tab10", legend="full", s=15)
    plt.title(title)
    plt.show()


@contextmanager
def timed(start_msg: str, end_msg: str) -> Iterator[None]:
    """Context manager timing the execution of its body."""
    if start_msg != "":
        print(start_msg + "...")
    start = perf_counter()
    yield
    took = perf_counter() - start
    print(end_msg + f" (took: {round(took, 2)}s)")
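
# Usage sketch:
#     with timed("Working", "Done"):
#         ...  # body being timed
# prints "Working..." and then e.g. "Done (took: 0.42s)".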


def main() -> None:
    """Program entrypoint."""
    with timed("Loading the MNIST dataset", "MNIST dataset loaded"):
        # Working with the entire dataset would be way too computationally expensive
        # (t-SNE would take hours, if not more); instead, downsize the dataset and
        # work with a smaller sample.
        mnist = load_data().downsize(8000)

    print()

    with timed("Reducing dimensionality using PCA", "PCA Finished"):
        mnist_pca = dim_reduce_pca(mnist)

    with timed("Reducing dimensionality using t-SNE", "t-SNE Finished"):
        mnist_tsne = dim_reduce_tsne(mnist)

    print()

    with timed("Measuring KNN accuracies", "Finished"):
        k_original, acc_original = knn_accuracy(mnist)
        k_pca, acc_pca = knn_accuracy(mnist_pca)
        k_tsne, acc_tsne = knn_accuracy(mnist_tsne)

    print()

    print(f"Original data: Best k={k_original}, Accuracy={acc_original:.2f}")
    print(f"PCA reduced data: Best k={k_pca}, Accuracy={acc_pca:.2f}")
    print(f"t-SNE reduced data: Best k={k_tsne}, Accuracy={acc_tsne:.2f}")

    print()

    with timed("Showing graphs", "Finished"):
        plot_2d(mnist_pca.x_train, mnist_pca.y_train, "2D PCA of MNIST")
        plot_2d(mnist_tsne.x_train, mnist_tsne.y_train, "2D t-SNE of MNIST")


if __name__ == "__main__":
    main()
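Assuming the src/ package layout shown above, the program can be run with "python -m src" from the project root; the __main__ guard at the end is what invokes main().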