Fitting an FTIR curve using linear regression

I am trying to fit measured FTIR spectra against a set of reference spectra.
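In other words, I want to express each measured spectrum as a (non-negative) linear combination of the reference spectra, roughly

$$y(\tilde{\nu}) \;\approx\; \sum_{j=1}^{117} c_j\, r_j(\tilde{\nu}), \qquad c_j \ge 0,$$

where $y$ is a measured spectrum, the $r_j$ are the 117 reference spectra, and the $c_j$ are the coefficients I want to extract.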

Input file path: spectra_path = ''

dict_keys(['__header__', '__version__', '__globals__', 'AB', 'wh'])
Shape of 'AB': (3301, 117)
Shape of 'wh': (1, 2)

wavenumber_path = ''

dict_keys(['__header__', '__version__', '__globals__', 'wavenumber'])
Shape of 'wavenumber': (1, 3301)

refnames_path = ''

dict_keys(['__header__', '__version__', '__globals__', 'refnames'])
Shape of 'refnames': (117,)

input_path = ''

dict_keys(['__header__', '__version__', '__globals__', 'ab', 'wn'])
Shape of 'ab': (467, 65537)
Shape of 'wn': (1, 467)
import scipy.io as sio
import pandas as pd
import itertools
from sklearn.linear_model import LinearRegression
from scipy.interpolate import interp1d
import numpy as np

def create_data_frame(spectra_file, refnames_file, wavenumber_file):
    # Reference spectra: 'AB' is (3301 wavenumbers, 117 references) -> transpose to (117, 3301)
    spectra_mat = sio.loadmat(spectra_file)
    data = spectra_mat['AB'].T
    data = data[:, ::-1]  # reverse the wavenumber axis (columns end up running 4000 -> 700)
    # Wavenumber axis: 'wavenumber' is (1, 3301) -> flatten and reverse to stay aligned with the data
    wavenumber_mat = sio.loadmat(wavenumber_file)
    wavenumber = wavenumber_mat['wavenumber'][0]
    wavenumber = wavenumber[::-1]
    # Reference names: 'refnames' is (117,)
    refnames_mat = sio.loadmat(refnames_file)
    names = refnames_mat['refnames'].flatten()
    wav = wavenumber.tolist()
    names = np.char.array(names).tolist()
    # One row per reference spectrum, one column per wavenumber, plus an 'ID' column
    final_data_frame = pd.DataFrame(data, columns=wav)
    final_data_frame['ID'] = names  # file ID or reference names
    return final_data_frame

final_data_frame = create_data_frame(spectra_path, refnames_path, wavenumber_path)
print(final_data_frame.shape)
final_data_frame

ref_spectra_df = create_data_frame(spectra_path, refnames_path, wavenumber_path)
# Keep only the 900-1800 cm-1 region of the reference spectra
wavenumber_columns = [col for col in ref_spectra_df.columns if col != 'ID' and 900 <= float(col) <= 1800]
ref_spectra = ref_spectra_df[wavenumber_columns]
print(ref_spectra.shape)

dat2 = sio.loadmat(refnames_path)
names = dat2['refnames'].flatten()
names = [str(name[0]) for name in names]
names = list(itertools.chain.from_iterable(names) if any(isinstance(i, list) for i in names) else names)
X = df_input  # input spectra reshaped to (pixels, wavenumbers); see below

# Assume both sets of spectra span 1800-900 cm-1 and interpolate the references onto the input grid
ref_wavenumbers = np.linspace(1800, 900, ref_spectra.shape[1])
input_wavenumbers = np.linspace(1800, 900, X.shape[1])

interpolator = interp1d(ref_wavenumbers, ref_spectra, axis=1, kind='linear')
ref_spectra_interpolated = interpolator(input_wavenumbers)  # (117, 467)

# Non-negative least squares fit: references as features, every pixel spectrum as a target
regression_model = LinearRegression(positive=True)
regression_model.fit(ref_spectra_interpolated.T, X.T)
coefs = regression_model.coef_
coefs_df = pd.DataFrame(coefs.T, index=X.index)
coefs_df.columns = names
# Sort references by mean coefficient, normalise by the column means, and drop columns with NaNs
coefs_df = coefs_df.reindex(coefs_df.mean().sort_values(ascending=False).index, axis=1)
norm = coefs_df.mean()
coefs_df = coefs_df / norm
coefs_df = coefs_df.dropna(axis=1, how='any')
coefs_df.to_csv('CLS fitting results - Serum.csv')

The above code ran into trouble while extracting the coefficients.

X = df_input is the input DataFrame. The raw data has shape (256, 256, 467), which I reorganized as (256*256, 467) = (65536, 467), so that each of the 65536 rows corresponds to one pixel.
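For context, the reshape I mean is just this (a minimal sketch with a dummy array standing in for my real cube):

# Dummy hyperspectral cube: 256 x 256 pixels, 467 wavenumbers
cube = np.zeros((256, 256, 467))

# Flatten the two spatial dimensions so each row is one pixel spectrum
pixels = cube.reshape(-1, cube.shape[-1])   # (65536, 467)
df_input = pd.DataFrame(pixels)
print(df_input.shape)                        # (65536, 467)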

So I further modified the regression code to iterate over each pixel.

regression_model = LinearRegression(positive=True)
coefs = np.zeros((X.shape[0], ref_spectra_interpolated.shape[1])) 
for i in range(X.shape[0]):
    if isinstance(X, pd.DataFrame):
        y = X.iloc[i, :].values  
    else:
        y = X[i, :]
    regression_model.fit(ref_spectra_interpolated, y.reshape(-1, 1))
    coefs[i, :] = regression_model.coef_

ValueError                                Traceback (most recent call last)
<ipython-input-35-fdd8039b578c> in <cell line: 6>()
      9     else:
     10         y = X[i, :]
---> 11     regression_model.fit(ref_spectra_interpolated, y.reshape(-1, 1))
     12     coefs[i, :] = regression_model.coef_

3 frames
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    395     uniques = np.unique(lengths)
    396     if len(uniques) > 1:
--> 397         raise ValueError(
    398             "Found input variables with inconsistent numbers of samples: %r"
    399             % [int(l) for l in lengths]

ValueError: Found input variables with inconsistent numbers of samples: [117, 467]
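I think the 117-vs-467 mismatch comes from the orientation of ref_spectra_interpolated: it has shape (117 references, 467 wavenumbers), while y has 467 values, so fit() sees 117 rows of features against 467 targets. Below is a minimal sketch of the orientation I believe fit() expects (wavenumbers as samples, references as features); this is my assumption, not a verified fix:

# Transpose so rows are wavenumbers (samples) and columns are references (features)
A = ref_spectra_interpolated.T               # (467, 117)

regression_model = LinearRegression(positive=True)
coefs = np.zeros((X.shape[0], A.shape[1]))   # (n_pixels, 117)

for i in range(X.shape[0]):
    y = X.iloc[i, :].values if isinstance(X, pd.DataFrame) else X[i, :]   # (467,)
    regression_model.fit(A, y)               # 467 samples, 117 features
    coefs[i, :] = regression_model.coef_     # one coefficient per reference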

Additional Information

Final DataFrame shape: (117, 3302)
   4000      3999      3998      3997      3996      3995      3994      3993  \
0   0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.010017   
1   0.0  0.076237  0.075786  0.092208  0.083860  0.089052  0.084329  0.093639   
2   0.0  0.006262  0.009665  0.011625  0.016771  0.020251  0.021349  0.016950   
3   0.0  0.073635  0.108422  0.166655  0.219240  0.290049  0.360189  0.441440   
4   0.0  0.293801  0.386299  0.399522  0.278570  0.125340  0.022092  0.000000   

       3992      3991  ...       708       707       706       705       704  \
0  0.006314  0.031430  ...  0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.101848  0.128834  ...  0.000000  0.000000  0.000000  0.000000  0.000000   
2  0.010785  0.004299  ...  0.006163  0.004709  0.003217  0.001170  0.000319   
3  0.519642  0.607271  ...  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.001895  0.040333  ...  0.019329  0.022445  0.023159  0.017931  0.006998   

        703       702       701  700                            ID  
0  0.000000  0.000000  0.000000  0.0  ATP                           
1  0.000000  0.000000  0.000000  0.0  Acid-phosphatase              
2  0.000000  0.000000  0.000185  0.0  Actin                         
3  0.000000  0.000000  0.000000  0.0  Adenine                       
4  0.003938  0.004616  0.022361  0.0  Ala-Phe                       

[5 rows x 3302 columns]
Reference Spectra shape: (117, 901)
       1800      1799      1798      1797      1796      1795      1794  \
0  0.398656  0.400172  0.401489  0.404073  0.408325  0.414702  0.421624   
1  0.690701  0.688172  0.684635  0.681072  0.677156  0.673283  0.669346   
2  0.191147  0.197091  0.203515  0.210285  0.216416  0.223856  0.233328   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.090553  0.101870  0.105062  0.107536  0.110965  0.119128  0.127208   

       1793      1792      1791  ...      909       908       907       906   \
0  0.429084  0.437233  0.447926  ...  0.003007  0.003197  0.003463  0.003506   
1  0.666719  0.665631  0.666432  ...  0.002496  0.002457  0.002665  0.002789   
2  0.245261  0.257144  0.268808  ...  0.000177  0.000900  0.002059  0.002893   
3  0.000000  0.000000  0.000000  ...  0.055016  0.057460  0.060365  0.061447   
4  0.139671  0.156604  0.181457  ...  0.000000  0.000000  0.000000  0.000000   

       905       904       903       902       901       900   
0  0.003749  0.004070  0.004753  0.005364  0.006069  0.006608  
1  0.003047  0.003060  0.003115  0.002932  0.002834  0.002494  
2  0.003389  0.003392  0.003278  0.002821  0.002229  0.001372  
3  0.060328  0.056912  0.053430  0.050287  0.048376  0.046766  
4  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  

[5 rows x 901 columns]
Names length: 117
First few names: ['A', 'A', 'A', 'A', 'A']
Interpolated Reference Spectra shape: (117, 467)
Coefficients shape: (65537, 117)
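Also, in the diagnostics above the names all come out as 'A' even though the full IDs (ATP, Acid-phosphatase, Actin, ...) are present in the DataFrame. I suspect str(name[0]) grabs only the first character when name is already a plain string; a sketch of the guard I mean (an assumption about how refnames is stored):

raw_names = sio.loadmat(refnames_path)['refnames'].flatten()

# Unwrap MATLAB cell entries if present; keep plain strings whole instead of slicing them
names = [str(n[0]) if isinstance(n, np.ndarray) else str(n) for n in raw_names]
print(names[:5])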

For plotting I used:

plt.plot(input_wavenumbers, ref_spectra_interpolated[:, i], label=name)

Any suggestions, or has anyone handled similar data? I appreciate your time and help.

I modified the above code; here is the updated version:

import numpy as np
import pandas as pd
from scipy import io

# Input image spectra: 'ab' is (467 wavenumbers, 65537 pixels) -> transpose to (pixels, wavenumbers)
input_mat = io.loadmat('[path to input mat file]')
input_spectra = input_mat['ab'].T
input_wavenumbers = np.linspace(1800, 900, input_spectra.shape[1])

# Reference spectra and their wavenumber axis
ref_spectra_mat = io.loadmat('path to reference spectra mat file')
reference_spectra = ref_spectra_mat['AB'].T
wavenumber_mat = io.loadmat('path to wavenumber mat file')
reference_wavenumbers = wavenumber_mat['wavenumber'][0]

reference_df = pd.DataFrame(reference_spectra, columns=reference_wavenumbers)
input_df = pd.DataFrame(input_spectra, columns=input_wavenumbers)

# Match every input wavenumber to its nearest reference wavenumber
matched_columns = {iw: reference_wavenumbers[np.argmin(np.abs(reference_wavenumbers - iw))] for iw in input_wavenumbers}
matched_reference_df = reference_df[list(matched_columns.values())]
matched_input_df = input_df.rename(columns=matched_columns)
print(f"Matched wavenumbers (input to reference): {matched_columns}")
matched_reference_df = matched_reference_df[matched_input_df.columns]
print(matched_reference_df)

# Ordinary least squares fit, one pixel spectrum at a time
X = matched_reference_df.to_numpy()   # (117 references, 467 wavenumbers)
Y = matched_input_df.to_numpy()       # (65537 pixels, 467 wavenumbers)
coefficients = np.zeros((Y.shape[0], X.shape[0]))
for i in range(Y.shape[0]):
    coefs, _, _, _ = np.linalg.lstsq(X.T, Y[i, :], rcond=None)
    coefficients[i, :] = coefs
print("Coefficients shape:", coefficients.shape)

# Reconstruct each pixel spectrum from its coefficients and compute the per-pixel MSE
reconstructed_Y = np.dot(coefficients, X)
residuals = Y - reconstructed_Y
mse = np.mean(residuals**2, axis=1)
print("Overall Mean Squared Error:", np.mean(mse))


# Gaussian log-likelihood per pixel, using the per-pixel MSE as the variance estimate
n = Y.shape[1]
log_likelihood = -n/2 * np.log(2 * np.pi * mse) - np.sum(residuals**2, axis=1) / (2 * mse)
print("Log Likelihood:", np.mean(log_likelihood))

log_likelihood = 1978.526696385699
mse = 0.004512451234982761

n = Y.shape[0]
k = X.shape[0]

print("Number of observations (n):", n)
print("Number of parameters (k):", k)
print("Log likelihood:", log_likelihood)

# Calculate BIC
BIC = k * np.log(n) - 2 * log_likelihood

print("BIC:", BIC)

However, I am not sure about the formula for the log-likelihood.
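What I am aiming for is, as far as I understand it, the standard Gaussian log-likelihood with the MLE variance plugged in:

$$\ln L = -\frac{n}{2}\ln\!\left(2\pi\hat{\sigma}^2\right) - \frac{1}{2\hat{\sigma}^2}\sum_{i=1}^{n} r_i^2,
\qquad \hat{\sigma}^2 = \frac{1}{n}\sum_{i=1}^{n} r_i^2 = \mathrm{MSE},$$

where the $r_i$ are the residuals of one pixel and $n$ is the number of wavenumbers. With $\hat{\sigma}^2$ equal to the MSE, the second term reduces to $n/2$, so $\ln L = -\tfrac{n}{2}\left(\ln(2\pi\,\mathrm{MSE}) + 1\right)$, which is what the code above computes; the BIC is then taken as $k\ln n - 2\ln L$.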

from statistics import variance
import math

reconstructed_Y = np.dot(coefficients, X)
residuals = Y - reconstructed_Y
se = residuals**2

n = Y.shape[1]
log_likelihood = []

for i in range(se.shape[0]):
    var = variance(residuals[i, :])
    # Calculate the cumulative product and then take the log
    ll = np.log(np.cumprod((1 / np.sqrt(2 * np.pi * var)) * np.exp(-se[i, :] / (2 * var))))
    log_likelihood.append(ll)

print("Log Likelihood:", log_likelihood)

However, the result ends up with errors such as underflow or inf values.
Any suggestions?
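From what I can tell, the inf/underflow comes from np.cumprod multiplying hundreds of density values before the log is taken; summing the log terms directly should be mathematically equivalent and stable (a sketch under the same Gaussian assumption, using np.var with ddof=1 in place of statistics.variance):

reconstructed_Y = np.dot(coefficients, X)
residuals = Y - reconstructed_Y

n = Y.shape[1]
log_likelihood = np.zeros(Y.shape[0])

for i in range(Y.shape[0]):
    var = np.var(residuals[i, :], ddof=1)   # sample variance of pixel i's residuals
    # log of a product = sum of logs, so no tiny intermediate products that underflow
    log_likelihood[i] = np.sum(-0.5 * np.log(2 * np.pi * var) - residuals[i, :]**2 / (2 * var))

print("Log Likelihood:", log_likelihood)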