Kaiser-Meyer-Olkin (KMO) Test
from factor_analyzer import FactorAnalyzer
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np
# create a dataset
data = pd.read_csv('TopLog.csv')
data = data.drop('Sample', axis=1)
# scale data
scaler = RobustScaler()
data_scaled = scaler.fit_transform(data)
from factor_analyzer.factor_analyzer import calculate_kmo
# Perform KMO test
kmo_all,kmo_model=calculate_kmo(data_scaled)
print("KMO Test Results: ")
print("KMO overall: ", kmo_model)
KMO Test Results:
KMO overall: 0.7981399209156622
PCA Loadings Bootstrapping
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.utils import resample
import pandas as pd
import numpy as np
# initialize PCA
pca = PCA()
# create a dataset
data = pd.read_csv('TopLog.csv')
data = data.drop('Sample', axis=1)
# scale data
scaler = RobustScaler()
data_scaled = scaler.fit_transform(data)
# fit PCA on original data
orig_loadings = pca.fit(data_scaled).components_
# bootstrap
n_iterations = 1000
boot_loadings = []
for i in range(n_iterations):
# bootstrap sample
sample = resample(data_scaled)
# fit PCA on bootstrap sample
loadings = pca.fit(sample).components_
# store result
boot_loadings.append(loadings)
print("Bootstrap loadings: ", boot_loadings)
from scipy import stats
# transpose the boot_loadings_df for ANOVA
transposed = boot_loadings_df.T
# perform ANOVA
f_val, p_val = stats.f_oneway(*[group for name, group in transposed.iteritems()])
print("F-value: ", f_val)
print("p-value: ", p_val)
if p_val < 0.05:
print("There is a significant difference among the bootstrap results.")
else:
print("There is no significant difference among the bootstrap results.")
F-value: 0.34224044699631684
p-value: 0.9999999999999999
There is no significant difference among the bootstrap results.
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
import mpld3
# Open the HTML file for writing
html_file = open('TopLog_results.html', 'w')
# Load the data
data = pd.read_csv('TopLog.csv')
# Select the columns to use for the PCA
cols = ['Cu', 'As', 'Zn', 'Mn', 'Pb', 'Ni', 'Cr']
pca_data = data[cols]
# Scale the data
scaler = RobustScaler()
scaled_data = scaler.fit_transform(pca_data)
# Perform the PCA
pca = PCA(n_components=len(cols))
pca.fit(scaled_data)
pca_df = pd.DataFrame(pca.transform(scaled_data), columns=[f'PC{i+1}' for i in range(len(cols))])
# Get the PCA components
components = pca.transform(scaled_data)
# Get the eigenvectors
eigenvectors = pca.components_
# Create a DataFrame for better visualization
df_variance = pd.DataFrame({'Explained Variance': pca.explained_variance_ratio_})
print(df_variance)
df_variance.to_csv('variance explained.csv', index=False)
#==========================================================
# Robust calculation of eigenvalues
eigenvalues = pca.explained_variance_
percentages = pca.explained_variance_ratio_ * 100
cumulative_percent = np.cumsum(percentages)
# Create the table
eigen_table = pd.DataFrame({
'Number': range(1, len(cols) + 1),
'Eigenvalue': eigenvalues,
'Individual Percent': percentages,
'Cumulative Percent': cumulative_percent,
'Scree Bar Plot': ['|' * int(eigenvalue * 10) for eigenvalue in eigenvalues]
})
print('Eigenvalues Robust Estimation\\\\\\\\n', eigen_table)
eigen_table.to_csv('eigen_table.csv', index=False)
#=========================================================
import matplotlib.pyplot as plt
#Eigenvectors plots
# Transpose the DataFrame for better visualization
df_variance = df_variance.T
print(df_variance)
df_variance.to_csv('variance explained.csv', index=False)
# Create the table
eigen_table = eigen_table.T
print('Eigenvalues Robust Estimation\\\\\\\\\\\\\\\\n', eigen_table)
eigen_table.to_csv('eigen_table.csv', index=False)
# Print the eigenvectors
df_eigenvectors = pd.DataFrame(eigenvectors, columns=[f'PC{i+1}' for i in range(len(cols))], index=cols)
print(df_eigenvectors)
df_eigenvectors.to_csv('Eigenvectors.csv', index=False)
#=================================================================
# Component loadings & plots
# Print the component loadings
loadings = pca.components_.T
# Convert loadings to a DataFrame for better visualization
df_loadings = pd.DataFrame(loadings, columns=[f'PC{i+1}' for i in range(len(cols))], index=cols)
print(df_loadings)
df_loadings.to_csv('Component Loadings.csv', index=False)
# Create component loading plots
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
# Define a colormap
cmap = cm.get_cmap('viridis')
for i in range(len(cols)-1):
plt.figure(figsize=(10,7))
scatter = plt.scatter(df_loadings[f'PC{i+1}'], df_loadings[f'PC{i+2}'], c=np.arange(len(df_loadings)), cmap=cmap)
plt.xlabel(f'Loading {i+1}')
plt.ylabel(f'Loading {i+2}')
plt.title(f'Loading {i+1} vs Loading {i+2}')
# Create a legend with the original variable names
for j in range(len(df_loadings)):
plt.text(df_loadings[f'PC{i+1}'][j], df_loadings[f'PC{i+2}'][j], cols[j], verticalalignment='bottom', horizontalalignment='right')
plt.colorbar(scatter, label='Index of Variable')
plt.savefig(f'Component Loadings Plot {i+1} vs {i+2}.svg')
html_string = mpld3.fig_to_html(plt.gcf())
html_file.write(html_string)
plt.show()
#---------------------------------------------
# Convert numbers to "|"
df_loadings = df_loadings.applymap(lambda x: '|' * int(abs(x) * 10))
print(df_loadings)
df_loadings.to_csv('Component Loadings Bar plots.csv', index=False)
#=============================================
#Structure summary
# Create component structure summary table with PCs as columns
structure_table = pd.DataFrame(pca.components_.T, columns=[f'PC{i+1}' for i in range(len(cols))], index=cols)
print('Component Structure Summary:')
print(structure_table)
structure_table.to_csv('structure_table.csv', index=True)
# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Update plotting code to use yellow from viridis for 0.6 and blue for -0.6
from matplotlib.colors import ListedColormap
cmap = ListedColormap(['#FDE724', '#440154'])
for col in structure_table.columns:
plt.figure(figsize=(10,7))
plt.barh(structure_table.index, structure_table[col], color=cmap(structure_table[col].values / structure_table[col].abs().max() * 0.5 + 0.5))
plt.xlabel('Loadings')
plt.title(f'Variable loadings for {col}')
html_string = mpld3.fig_to_html(plt.gcf())
html_file.write(html_string)
plt.show()
#============================================
# component loadings correlation heatmap
loadings = pca.components_
num_pc = pca.n_features_
pc_list = ["PC"+str(i) for i in list(range(1, num_pc+1))]
loadings_df = pd.DataFrame.from_dict(dict(zip(pc_list, loadings)))
loadings_df['variable'] = [pca_df.columns.values for i in list(range(1, num_pc+1))]
loadings_df = pca.components_
# get correlation matrix plot for loadings
import seaborn as sns
import matplotlib.pyplot as plt
ax = sns.heatmap(loadings_df, annot=True, cmap='viridis') # Updated to use 'viridis' palette
ax.set_xticklabels(cols, rotation=0)
ax.set_yticklabels(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7'], rotation=0, horizontalalignment='right')
ax.set_title('Comp-Load_TopLog')
plt.savefig('Load_CM_TopLog.svg', format='svg')
html_string = mpld3.fig_to_html(plt.gcf())
html_file.write(html_string)
plt.show()
# get PC scores
pca_scores = PCA().fit_transform(pca_data)
explained = pca.explained_variance_ratio_
# get 2D biplot
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(pca_scores[:,0], pca_scores[:,1], edgecolors='none', facecolors='#d8ebff') # Updated line
coeff = np.transpose(pca.components_[0:2, :])
n = coeff.shape[0]
for i in range(n):
plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = '#731a47',alpha = 0.5)
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, cols[i], color = '#731a47', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC1 ({:.2f}%)'.format(explained[0]*100), fontsize = 10)
plt.ylabel('PC2 ({:.2f}%)'.format(explained[1]*100), fontsize = 10)
plt.title('TopLog PC1-PC2')
plt.savefig('TopLog PC1-PC2.svg', format='svg')
html_string = mpld3.fig_to_html(fig)
html_file.write(html_string)
plt.show()
# get 2D biplot for PC1 vs PC3
fig, ax = plt.subplots()
ax.scatter(pca_scores[:,0], pca_scores[:,2], edgecolors='none', facecolors='#d8ebff')
coeff = np.transpose(pca.components_[[0,2], :])
n = coeff.shape[0]
for i in range(n):
plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = '#731a47',alpha = 0.5)
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, cols[i], color = '#731a47', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC1 ({:.2f}%)'.format(explained[0]*100), fontsize = 10)
plt.ylabel('PC3 ({:.2f}%)'.format(explained[2]*100), fontsize = 10)
plt.title('TopLog PC1-PC3')
plt.savefig('TopLog PC1-PC3.svg', format='svg')
html_string = mpld3.fig_to_html(fig)
html_file.write(html_string)
plt.show()
# get 2D biplot for PC2 vs PC3
fig, ax = plt.subplots()
ax.scatter(pca_scores[:,1], pca_scores[:,2], edgecolors='none', facecolors='#d8ebff')
coeff = np.transpose(pca.components_[[1,2], :])
n = coeff.shape[0]
for i in range(n):
plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = '#731a47',alpha = 0.5)
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, cols[i], color = '#731a47', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC2 ({:.2f}%)'.format(explained[1]*100), fontsize = 10)
plt.ylabel('PC3 ({:.2f}%)'.format(explained[2]*100), fontsize = 10)
plt.title('TopLog PC2-PC3')
plt.savefig('TopLog PC2-PC3.svg', format='svg')
html_string = mpld3.fig_to_html(fig)
html_file.write(html_string)
plt.show()
# get 2D biplot for PC3 vs PC4
fig, ax = plt.subplots()
ax.scatter(pca_scores[:,2], pca_scores[:,3], edgecolors='none', facecolors='#d8ebff')
coeff = np.transpose(pca.components_[[2,3], :])
n = coeff.shape[0]
for i in range(n):
plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = '#731a47',alpha = 0.5)
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, cols[i], color = '#731a47', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC3 ({:.2f}%)'.format(explained[2]*100), fontsize = 10)
plt.ylabel('PC4 ({:.2f}%)'.format(explained[3]*100), fontsize = 10)
plt.title('TopLog PC3-PC4')
plt.savefig('TopLog PC3-PC4.svg', format='svg')
html_string = mpld3.fig_to_html(fig)
html_file.write(html_string)
plt.show()
html_file.close()