Basic implementation of the task
This commit is contained in:
parent
b269c2223a
commit
e11750d54b
|
@ -6,13 +6,13 @@ authors = [{ name = "Peter Vacho", email = "p_vacho@utb.cz" }]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"numpy>=2.1.1",
|
"numpy>=2.1.1",
|
||||||
"matplotlib>=3.9.2",
|
"matplotlib>=3.9.2",
|
||||||
"pyqt5>=5.15.11",
|
|
||||||
"scikit-learn>=1.5.2",
|
"scikit-learn>=1.5.2",
|
||||||
"polars[all]>=1.9.0",
|
"polars[all]>=1.9.0",
|
||||||
"seaborn>=0.13.2",
|
"seaborn>=0.13.2",
|
||||||
"rich>=13.9.2",
|
"rich>=13.9.2",
|
||||||
"httpx>=0.27.2",
|
"httpx>=0.27.2",
|
||||||
"beautifulsoup4>=4.12.3",
|
"beautifulsoup4>=4.12.3",
|
||||||
|
"pyqt6>=6.7.1",
|
||||||
]
|
]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">= 3.12"
|
requires-python = ">= 3.12"
|
||||||
|
|
|
@ -191,12 +191,12 @@ pyiceberg==0.7.1
|
||||||
pyparsing==3.1.4
|
pyparsing==3.1.4
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
# via pyiceberg
|
# via pyiceberg
|
||||||
pyqt5==5.15.11
|
pyqt6==6.7.1
|
||||||
# via task5
|
# via task5
|
||||||
pyqt5-qt5==5.15.15
|
pyqt6-qt6==6.7.3
|
||||||
# via pyqt5
|
# via pyqt6
|
||||||
pyqt5-sip==12.15.0
|
pyqt6-sip==13.8.0
|
||||||
# via pyqt5
|
# via pyqt6
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
# via pandas
|
# via pandas
|
||||||
|
|
|
@ -151,12 +151,12 @@ pyiceberg==0.7.1
|
||||||
pyparsing==3.1.4
|
pyparsing==3.1.4
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
# via pyiceberg
|
# via pyiceberg
|
||||||
pyqt5==5.15.11
|
pyqt6==6.7.1
|
||||||
# via task5
|
# via task5
|
||||||
pyqt5-qt5==5.15.15
|
pyqt6-qt6==6.7.3
|
||||||
# via pyqt5
|
# via pyqt6
|
||||||
pyqt5-sip==12.15.0
|
pyqt6-sip==13.8.0
|
||||||
# via pyqt5
|
# via pyqt6
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
# via pandas
|
# via pandas
|
||||||
|
|
160
src/__main__.py
160
src/__main__.py
|
@ -1,5 +1,165 @@
|
||||||
|
import os
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from time import perf_counter
|
||||||
|
from typing import cast, final
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from pandas import DataFrame, Series
|
||||||
|
from sklearn.datasets import fetch_openml
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
from sklearn.manifold import TSNE
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
from sklearn.model_selection import GridSearchCV, train_test_split
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
from sklearn.utils import Bunch
|
||||||
|
|
||||||
|
# pyqt6 only bundles the Windows & Fusion styles, which means that if a
# different preferred qt style is requested through QT_STYLE_OVERRIDE, a
# warning would be produced. Dropping the env-var override in that case gets
# rid of the warning.
#
# The variable may also be entirely unset (get() then returns None, which is
# not one of the bundled styles), so pop() is given a default to avoid raising
# a KeyError at import time.
if os.environ.get("QT_STYLE_OVERRIDE") not in {"Windows", "Fusion"}:
    os.environ.pop("QT_STYLE_OVERRIDE", None)
|
||||||
|
|
||||||
|
|
||||||
|
@final
@dataclass
class MLDataset:
    """Container for a Machine Learning dataset split into two portions.

    One portion is meant for training and the other for testing. Each portion
    keeps its features (x) together with the matching target values (y).
    """

    x_train: DataFrame
    y_train: "Series[str]"
    x_test: DataFrame
    y_test: "Series[str]"

    def downsize(self, sample_size: int) -> "MLDataset":
        """Return a new dataset keeping only the leading ``sample_size`` items.

        The same limit is applied to the training and the testing portion
        independently. A limit larger than a portion simply keeps that whole
        portion, so the result is then a new instance with the same content.
        """
        # One shared slice object, applied to every member of the dataset.
        leading = slice(None, sample_size)
        return MLDataset(
            x_train=self.x_train[leading],
            y_train=self.y_train[leading],
            x_test=self.x_test[leading],
            y_test=self.y_test[leading],
        )
|
||||||
|
|
||||||
|
|
||||||
|
def load_data() -> MLDataset:
    """Fetch the MNIST dataset from OpenML and split it into train/test portions.

    20% of the samples are put aside for testing; the split uses a fixed
    random state, so it is the same on every run.
    """
    bunch = cast(Bunch, fetch_openml("mnist_784", version=1))
    features = cast(DataFrame, bunch.data)
    targets = cast("Series[str]", bunch.target)

    # MNIST generally comes with predefined testing and training portions, but
    # sklearn hands it over as a single dataset, so the split is done here.
    split = train_test_split(features, targets, test_size=0.2, random_state=12)
    x_train, x_test, y_train, y_test = split

    # Scikit functions have terrible typing definitions...
    return MLDataset(
        x_train=x_train,  # type: ignore[reportArgumentType]
        y_train=y_train,  # type: ignore[reportArgumentType]
        x_test=x_test,  # type: ignore[reportArgumentType]
        y_test=y_test,  # type: ignore[reportArgumentType]
    )
|
||||||
|
|
||||||
|
|
||||||
|
def dim_reduce_pca(data: MLDataset, dimensions: int = 2) -> MLDataset:
    """Reduce dimensionality using the PCA method.

    The principal components are fitted on the training features only. The
    testing features are then projected onto those same components, so both
    portions end up in one shared coordinate system.

    Args:
        data: Dataset whose features should be reduced.
        dimensions: Number of principal components to keep.

    Returns:
        A new dataset holding the PCA outputs as features, with the target
        values taken over unchanged from ``data``.
    """
    pca = PCA(n_components=dimensions, random_state=12)
    x_train_pca = pca.fit_transform(data.x_train)
    # Only transform here: fitting again on the testing portion would compute
    # a different set of components, making the reduced test features
    # incomparable with the reduced training features.
    x_test_pca = pca.transform(data.x_test)
    return MLDataset(x_train=x_train_pca, y_train=data.y_train, x_test=x_test_pca, y_test=data.y_test)
|
||||||
|
|
||||||
|
|
||||||
|
def dim_reduce_tsne(data: MLDataset, dimensions: int = 2, perplexity: int = 30, max_iter: int = 500) -> MLDataset:
    """Reduce dimensionality using the TSNE method.

    TSNE can only embed the data it is fitted on (it offers no way of
    projecting new samples into an existing embedding), and two separate runs
    produce unrelated coordinate systems. The training and testing features
    are therefore embedded together in a single run and split back afterwards,
    so that both portions share one embedding space.

    Args:
        data: Dataset whose features should be reduced.
        dimensions: Dimension of the embedded space.
        perplexity: Perplexity setting passed to TSNE.
        max_iter: Maximum number of optimization iterations passed to TSNE.

    Returns:
        A new dataset holding the embedded features, with the target values
        taken over unchanged from ``data``.
    """
    # Local import, numpy is only needed for joining the two portions.
    import numpy as np

    tsne = TSNE(
        n_components=dimensions,
        random_state=12,
        perplexity=perplexity,
        max_iter=max_iter,
        n_jobs=-1,  # Run in parallel, using all available processors
    )

    train_size = len(data.x_train)
    combined = np.concatenate([np.asarray(data.x_train), np.asarray(data.x_test)])
    embedded = tsne.fit_transform(combined)

    # The rows keep their order, so the first train_size rows are the training ones.
    return MLDataset(
        x_train=embedded[:train_size],
        y_train=data.y_train,
        x_test=embedded[train_size:],
        y_test=data.y_test,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def knn_accuracy(data: MLDataset, k_range: range | None = None) -> tuple[object, float]:
    """Search for the KNN neighbor count giving the best accuracy.

    A 5-fold cross-validated grid search over ``k_range`` (1 to 10 when not
    given) is fitted on the training portion. The resulting search object is
    then used to predict the testing portion.

    Returns:
        The ``n_neighbors`` value picked by the search, and the accuracy of
        its predictions on the testing portion.
    """
    candidates = range(1, 11) if k_range is None else k_range

    search = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": candidates}, cv=5)
    search.fit(data.x_train, data.y_train)

    predictions = search.predict(data.x_test)
    return search.best_params_["n_neighbors"], accuracy_score(data.y_test, predictions)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_2d(x: DataFrame, y: "Series[str]", title: str) -> None:
    """Show a 2D visualization of the given 2D (dim-reduced) dataset.

    Args:
        x: The reduced features; the first two columns are used as the
            horizontal and vertical coordinates. Both a DataFrame and a plain
            array (as produced by the dimensionality reductions) are accepted.
        y: Target values, used to color the points.
        title: Title of the figure.
    """
    # Local import, numpy is only needed for normalizing the input here.
    import numpy as np

    # Positional column indexing (points[:, 0]) is only supported by arrays,
    # a DataFrame would fail on it, so convert first.
    points = np.asarray(x)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=points[:, 0], y=points[:, 1], hue=y, palette="tab10", legend="full", s=15)
    plt.title(title)
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def timed(start_msg: str, end_msg: str) -> Iterator[None]:
    """Context manager measuring how long its body takes to run.

    A non-empty ``start_msg`` is printed (followed by an ellipsis) before the
    body runs. Once the body finishes, ``end_msg`` is printed together with
    the elapsed time in seconds, rounded to 2 decimal places.
    """
    if start_msg:
        print(start_msg + "...")

    started_at = perf_counter()
    yield
    took = perf_counter() - started_at

    print(end_msg + f" (took: {round(took, 2)}s)")
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """Program entrypoint.

    Loads a downsized MNIST sample, reduces it to 2 dimensions with PCA and
    t-SNE, compares the KNN accuracy on the original and on both reduced
    datasets, and finally shows a scatter plot of each reduced training set.
    """
    with timed("Loading the MNIST dataset", "MNIST dataset loaded"):
        # The entire dataset would be way too computationally expensive to
        # work with (TSNE would take hours, if not more), so only a smaller
        # sample of it is used.
        dataset = load_data().downsize(8000)

    print()

    with timed("Reducing dimensionality using PCA", "PCA Finished"):
        pca_dataset = dim_reduce_pca(dataset)

    with timed("Reducing dimensionality using t-SNE", "t-SNE Finished"):
        tsne_dataset = dim_reduce_tsne(dataset)

    print()

    with timed("Measuring KNN accuracies", "Finished"):
        # Evaluated lazily, in order: original, PCA, t-SNE.
        (k_original, acc_original), (k_pca, acc_pca), (k_tsne, acc_tsne) = (
            knn_accuracy(variant) for variant in (dataset, pca_dataset, tsne_dataset)
        )

    print()

    print(f"Original data: Best k={k_original}, Accuracy={acc_original:.2f}")
    print(f"PCA reduced data: Best k={k_pca}, Accuracy={acc_pca:.2f}")
    print(f"t-SNE reduced data: Best k={k_tsne}, Accuracy={acc_tsne:.2f}")

    print()

    with timed("Showing graphs", "Finished"):
        plot_2d(pca_dataset.x_train, pca_dataset.y_train, "2D PCA of MNIST")
        plot_2d(tsne_dataset.x_train, tsne_dataset.y_train, "2D t-SNE of MNIST")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in a new issue