Topsoil | Notion

Original data

Logged values

Preprocessing tests

Kaiser-Meyer-Olkin (KMO) Test

from factor_analyzer import FactorAnalyzer
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np

# create a dataset
data = pd.read_csv('TopLog.csv')
data = data.drop('Sample', axis=1)

# scale data
scaler = RobustScaler()
data_scaled = scaler.fit_transform(data)

from factor_analyzer.factor_analyzer import calculate_kmo

# Perform KMO test
kmo_all,kmo_model=calculate_kmo(data_scaled)

print("KMO Test Results: ")
print("KMO overall: ", kmo_model)

KMO Test Results: 
KMO overall:  0.7981399209156622

PCA Loadings Bootstrapping

from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.utils import resample
import pandas as pd
import numpy as np

# initialize PCA
pca = PCA()

# create a dataset
data = pd.read_csv('TopLog.csv')
data = data.drop('Sample', axis=1)

# scale data
scaler = RobustScaler()
data_scaled = scaler.fit_transform(data)

# fit PCA on original data
orig_loadings = pca.fit(data_scaled).components_

# bootstrap
n_iterations = 1000
boot_loadings = []

for i in range(n_iterations):
    # bootstrap sample
    sample = resample(data_scaled)

    # fit PCA on bootstrap sample
    loadings = pca.fit(sample).components_

    # store result
    boot_loadings.append(loadings)

print("Bootstrap loadings: ", boot_loadings)

from scipy import stats

# transpose the boot_loadings_df for ANOVA
transposed = boot_loadings_df.T

# perform ANOVA
f_val, p_val = stats.f_oneway(*[group for name, group in transposed.iteritems()])

print("F-value: ", f_val)
print("p-value: ", p_val)

if p_val < 0.05:
    print("There is a significant difference among the bootstrap results.")
else:
    print("There is no significant difference among the bootstrap results.")

F-value:  0.34224044699631684
p-value:  0.9999999999999999
There is no significant difference among the bootstrap results.

PCA Results

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

import mpld3

# Open the HTML file for writing
html_file = open('TopLog_results.html', 'w')

# Load the data
data = pd.read_csv('TopLog.csv')

# Select the columns to use for the PCA
cols = ['Cu', 'As', 'Zn', 'Mn', 'Pb', 'Ni', 'Cr']
pca_data = data[cols]

# Scale the data
scaler = RobustScaler()
scaled_data = scaler.fit_transform(pca_data)

# Perform the PCA
pca = PCA(n_components=len(cols))
pca.fit(scaled_data)

pca_df = pd.DataFrame(pca.transform(scaled_data), columns=[f'PC{i+1}' for i in range(len(cols))])

# Get the PCA components
components = pca.transform(scaled_data)

# Get the eigenvectors
eigenvectors = pca.components_

# Create a DataFrame for better visualization
df_variance = pd.DataFrame({'Explained Variance': pca.explained_variance_ratio_})

print(df_variance)
df_variance.to_csv('variance explained.csv', index=False)

#==========================================================

# Robust calculation of eigenvalues
eigenvalues = pca.explained_variance_
percentages = pca.explained_variance_ratio_ * 100
cumulative_percent = np.cumsum(percentages)

# Create the table
eigen_table = pd.DataFrame({
    'Number': range(1, len(cols) + 1),
    'Eigenvalue': eigenvalues,
    'Individual Percent': percentages,
    'Cumulative Percent': cumulative_percent,
    'Scree Bar Plot': ['|' * int(eigenvalue * 10) for eigenvalue in eigenvalues]
})

print('Eigenvalues Robust Estimation\\\\\\\\n', eigen_table)
eigen_table.to_csv('eigen_table.csv', index=False)

#=========================================================

import matplotlib.pyplot as plt

#Eigenvectors plots

# Transpose the DataFrame for better visualization
df_variance = df_variance.T
print(df_variance)
df_variance.to_csv('variance explained.csv', index=False)

# Create the table
eigen_table = eigen_table.T
print('Eigenvalues Robust Estimation\\\\\\\\\\\\\\\\n', eigen_table)
eigen_table.to_csv('eigen_table.csv', index=False)

# Print the eigenvectors
df_eigenvectors = pd.DataFrame(eigenvectors, columns=[f'PC{i+1}' for i in range(len(cols))], index=cols)
print(df_eigenvectors)
df_eigenvectors.to_csv('Eigenvectors.csv', index=False)

#=================================================================
# Component loadings & plots

# Print the component loadings
loadings = pca.components_.T
# Convert loadings to a DataFrame for better visualization
df_loadings = pd.DataFrame(loadings, columns=[f'PC{i+1}' for i in range(len(cols))], index=cols)
print(df_loadings)
df_loadings.to_csv('Component Loadings.csv', index=False)

    
# Create component loading plots
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Define a colormap
cmap = cm.get_cmap('viridis')

for i in range(len(cols)-1):
    plt.figure(figsize=(10,7))
    scatter = plt.scatter(df_loadings[f'PC{i+1}'], df_loadings[f'PC{i+2}'], c=np.arange(len(df_loadings)), cmap=cmap)
    plt.xlabel(f'Loading {i+1}')
    plt.ylabel(f'Loading {i+2}')
    plt.title(f'Loading {i+1} vs Loading {i+2}')

    # Create a legend with the original variable names
    for j in range(len(df_loadings)):
        plt.text(df_loadings[f'PC{i+1}'][j], df_loadings[f'PC{i+2}'][j], cols[j], verticalalignment='bottom', horizontalalignment='right')

    plt.colorbar(scatter, label='Index of Variable')
    plt.savefig(f'Component Loadings Plot {i+1} vs {i+2}.svg')
    html_string = mpld3.fig_to_html(plt.gcf())  
    html_file.write(html_string)
    plt.show()

#---------------------------------------------

# Convert numbers to "|"
df_loadings = df_loadings.applymap(lambda x: '|' * int(abs(x) * 10))

print(df_loadings)
df_loadings.to_csv('Component Loadings Bar plots.csv', index=False)

#=============================================
#Structure summary

# Create component structure summary table with PCs as columns
structure_table = pd.DataFrame(pca.components_.T, columns=[f'PC{i+1}' for i in range(len(cols))], index=cols)
print('Component Structure Summary:')
print(structure_table)
structure_table.to_csv('structure_table.csv', index=True)

# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Update plotting code to use yellow from viridis for 0.6 and blue for -0.6
from matplotlib.colors import ListedColormap
cmap = ListedColormap(['#FDE724', '#440154'])

for col in structure_table.columns:
    plt.figure(figsize=(10,7))
    plt.barh(structure_table.index, structure_table[col], color=cmap(structure_table[col].values / structure_table[col].abs().max() * 0.5 + 0.5))
    plt.xlabel('Loadings')
    plt.title(f'Variable loadings for {col}')
    html_string = mpld3.fig_to_html(plt.gcf())  
    html_file.write(html_string)
    plt.show()

#============================================
# component loadings correlation heatmap
loadings = pca.components_
num_pc = pca.n_features_
pc_list = ["PC"+str(i) for i in list(range(1, num_pc+1))]
loadings_df = pd.DataFrame.from_dict(dict(zip(pc_list, loadings)))
loadings_df['variable'] = [pca_df.columns.values for i in list(range(1, num_pc+1))]
loadings_df = pca.components_

# get correlation matrix plot for loadings
import seaborn as sns
import matplotlib.pyplot as plt
ax = sns.heatmap(loadings_df, annot=True, cmap='viridis')  # Updated to use 'viridis' palette
ax.set_xticklabels(cols, rotation=0)
ax.set_yticklabels(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7'], rotation=0, horizontalalignment='right')
ax.set_title('Comp-Load_TopLog')
plt.savefig('Load_CM_TopLog.svg', format='svg')
html_string = mpld3.fig_to_html(plt.gcf()) 
html_file.write(html_string)
plt.show()

# get PC scores
pca_scores = PCA().fit_transform(pca_data)

explained = pca.explained_variance_ratio_

# get 2D biplot
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(pca_scores[:,0], pca_scores[:,1], edgecolors='none', facecolors='#d8ebff')  # Updated line
coeff = np.transpose(pca.components_[0:2, :])
n = coeff.shape[0]
for i in range(n):
    plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = '#731a47',alpha = 0.5)
    plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, cols[i], color = '#731a47', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC1 ({:.2f}%)'.format(explained[0]*100), fontsize = 10)
plt.ylabel('PC2 ({:.2f}%)'.format(explained[1]*100), fontsize = 10)
plt.title('TopLog PC1-PC2')
plt.savefig('TopLog PC1-PC2.svg', format='svg')
html_string = mpld3.fig_to_html(fig)  
html_file.write(html_string)
plt.show()

# get 2D biplot for PC1 vs PC3
fig, ax = plt.subplots()
ax.scatter(pca_scores[:,0], pca_scores[:,2], edgecolors='none', facecolors='#d8ebff')
coeff = np.transpose(pca.components_[[0,2], :])
n = coeff.shape[0]
for i in range(n):
    plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = '#731a47',alpha = 0.5)
    plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, cols[i], color = '#731a47', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC1 ({:.2f}%)'.format(explained[0]*100), fontsize = 10)
plt.ylabel('PC3 ({:.2f}%)'.format(explained[2]*100), fontsize = 10)
plt.title('TopLog PC1-PC3')
plt.savefig('TopLog PC1-PC3.svg', format='svg')
html_string = mpld3.fig_to_html(fig)  
html_file.write(html_string)
plt.show()

# get 2D biplot for PC2 vs PC3
fig, ax = plt.subplots()
ax.scatter(pca_scores[:,1], pca_scores[:,2], edgecolors='none', facecolors='#d8ebff')
coeff = np.transpose(pca.components_[[1,2], :])
n = coeff.shape[0]
for i in range(n):
    plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = '#731a47',alpha = 0.5)
    plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, cols[i], color = '#731a47', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC2 ({:.2f}%)'.format(explained[1]*100), fontsize = 10)
plt.ylabel('PC3 ({:.2f}%)'.format(explained[2]*100), fontsize = 10)
plt.title('TopLog PC2-PC3')
plt.savefig('TopLog PC2-PC3.svg', format='svg')
html_string = mpld3.fig_to_html(fig)  
html_file.write(html_string)
plt.show()

# get 2D biplot for PC3 vs PC4
fig, ax = plt.subplots()
ax.scatter(pca_scores[:,2], pca_scores[:,3], edgecolors='none', facecolors='#d8ebff')
coeff = np.transpose(pca.components_[[2,3], :])
n = coeff.shape[0]
for i in range(n):
    plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = '#731a47',alpha = 0.5)
    plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, cols[i], color = '#731a47', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC3 ({:.2f}%)'.format(explained[2]*100), fontsize = 10)
plt.ylabel('PC4 ({:.2f}%)'.format(explained[3]*100), fontsize = 10)
plt.title('TopLog PC3-PC4')
plt.savefig('TopLog PC3-PC4.svg', format='svg')
html_string = mpld3.fig_to_html(fig)  
html_file.write(html_string)
plt.show()

html_file.close()

Component Loadings Plot 1 vs 2.svg

Component Loadings Plot 2 vs 3.svg

Component Loadings Plot 3 vs 4.svg

Component Loadings Plot 4 vs 5.svg

Component Loadings Plot 5 vs 6.svg

Component Loadings Plot 6 vs 7.svg