-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis_for_PCA.py
executable file
·87 lines (68 loc) · 2.56 KB
/
analysis_for_PCA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import librosa
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
from tqdm import tqdm
from joblib import Memory, Parallel, delayed
# Directory with datasets.
audio_dir_prefix = r""

# Dataset to use.
selected = "fma_small"

# Number of numbered sub-folders to scan (folders are named 000, 001, ...).
# NOTE(review): range(upper_limit) stops at folder 152 -- confirm this matches
# the number of folders actually present in the selected dataset.
upper_limit = 153

# Upper bound on how many files are drawn for the analysis.
sample_size = 500

# Generate folder names from 000 up to (but excluding) upper_limit.
folder_names = [f"{i:03d}" for i in range(upper_limit)]

audio_files = []
for folder in folder_names:
    folder_path = os.path.join(audio_dir_prefix, selected, folder)
    # isdir (rather than exists) so a stray file with a folder-like name is
    # skipped instead of making os.listdir raise.
    if not os.path.isdir(folder_path):
        continue
    # Sort for a reproducible listing order and keep regular files only, so
    # nested directories are never handed to the audio loader.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        if os.path.isfile(file_path):
            audio_files.append(file_path)

# Fail early with an actionable message instead of an opaque numpy error.
if not audio_files:
    raise FileNotFoundError(
        "No audio files found under "
        f"{os.path.join(audio_dir_prefix, selected)!r}; "
        "check audio_dir_prefix and selected."
    )

# Convert audio_files to a numpy array so an index array can select from it.
audio_files = np.array(audio_files)

# Select a random sample of audio files without replacement. The size is
# capped at the number of files found because np.random.choice raises
# ValueError when asked for more unique items than exist.
indices = np.random.choice(
    len(audio_files), size=min(sample_size, len(audio_files)), replace=False
)
sample_files = audio_files[indices]
# On-disk cache so results for already-processed files can be reused.
memory = Memory("cache_directory", verbose=0)


@memory.cache
def process_file(file):
    """Return the standardized MFCC matrix for one audio file.

    The file is loaded with librosa's default settings and MFCCs are
    computed with librosa's default parameters. The MFCC matrix is
    transposed before scaling and transposed back afterwards, so the
    returned array has the same shape as the raw MFCC matrix.

    Assumes librosa's coefficients-by-frames layout (see the librosa
    docs), in which case each coefficient is standardized across the
    frames of this single file.

    Calls are memoized through ``memory.cache``.
    """
    signal, sample_rate = librosa.load(file)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate)
    # StandardScaler works column-wise, hence the transpose round trip.
    standardized = StandardScaler().fit_transform(coefficients.T)
    return standardized.T
# Use joblib to process the files in parallel (n_jobs=-1: all CPUs).
mfcc_features_list = Parallel(n_jobs=-1)(
    delayed(process_file)(file)
    for file in tqdm(sample_files, total=len(sample_files))
)

# Concatenate all the MFCC features into a single 2D array by joining the
# per-file matrices along axis 1.
mfcc_features = np.concatenate(mfcc_features_list, axis=1)

# Fit PCA on the standardized features without reducing dimensionality.
pca = PCA()
# Transpose to have samples as rows.
mfcc_pca = pca.fit_transform(mfcc_features.T)

# Calculate cumulative explained variance ratio; entry i is the variance
# explained by the first i + 1 components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Smallest number of components reaching the threshold. np.where yields a
# 0-based index, so add 1 to turn it into a component count.
variance_threshold = 0.95
optimal_components = (
    int(np.where(cumulative_variance >= variance_threshold)[0][0]) + 1
)

# Plot the cumulative explained variance against 1-based component counts so
# the x-axis really is the "Number of Components" and the vertical marker
# agrees with the printed value.
component_counts = range(1, len(cumulative_variance) + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker="o")
plt.title("Cumulative Explained Variance by PCA Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.grid(True)
# Line at the variance threshold.
plt.axhline(y=variance_threshold, color="r", linestyle="--")
# Line at the optimal component count.
plt.axvline(x=optimal_components, color="r", linestyle="--")
plt.show()

# Print optimal number of components for at least 95% variance.
print("Optimal number of components:", optimal_components)