# mva_script.py
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns


def run_mva(data, labels=None, variance_threshold=0.8):
    """
    Complete MVA pipeline.

    Parameters:
        data : pd.DataFrame or np.array
        labels : array-like, optional (for LDA)
        variance_threshold : float, cumulative variance for PCA
    """
    # Step 1: Impute missing values with the per-feature median
    imputer = SimpleImputer(strategy='median')
    data_imp = imputer.fit_transform(data)
    # Step 2: Scale features to zero mean and unit variance
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data_imp)
    # Step 3: PCA on the scaled data (cap at 10 components or the feature count)
    pca = PCA(n_components=min(data_scaled.shape[1], 10))
    pca_scores = pca.fit_transform(data_scaled)
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    if cum_var[-1] < variance_threshold:
        # Threshold never reached within the fitted components: keep them all
        n_comp = len(cum_var)
    else:
        n_comp = int(np.argmax(cum_var >= variance_threshold)) + 1
    print(f"Optimal PCA components: {n_comp} (explained {cum_var[n_comp-1]:.2%})")
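
    # Step 4: Clustering. A minimal sketch, assuming a KMeans step on the
    # retained PCA scores (KMeans is imported above); the fixed k=3 is a
    # hypothetical placeholder, not a value taken from the pipeline.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
    cluster_labels = kmeans.fit_predict(pca_scores[:, :n_comp])
    print("KMeans cluster sizes:", np.bincount(cluster_labels))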
    # Step 5: LDA (if labels exist)
    if labels is not None:
        labels = np.asarray(labels)  # ensure boolean masking works below
        n_lda = min(2, len(np.unique(labels)) - 1)
        lda = LDA(n_components=n_lda)
        lda_scores = lda.fit_transform(data_scaled, labels)
        print("LDA applied. Reduced shape:", lda_scores.shape)

        # LDA scatter plot (with only two classes there is a single
        # discriminant axis, so plot it against zero instead of a second axis)
        plt.figure()
        for lab in np.unique(labels):
            subset = lda_scores[labels == lab]
            y = subset[:, 1] if n_lda > 1 else np.zeros(len(subset))
            plt.scatter(subset[:, 0], y, label=f'Class {lab}')
        plt.legend()
        plt.title('LDA Projection')
        plt.savefig('lda_plot.png')
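

# Example usage: a minimal sketch on synthetic data. The column names, sizes,
# and number of classes below are illustrative assumptions only.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo = pd.DataFrame(rng.normal(size=(100, 5)),
                        columns=[f"feat_{i}" for i in range(5)])
    demo.iloc[::10, 0] = np.nan                  # inject a few missing values
    demo_labels = rng.integers(0, 3, size=100)   # three hypothetical classes
    run_mva(demo, labels=demo_labels, variance_threshold=0.8)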