# Fitting a FTIR curve using Linear regression

I am trying to fit FTIR spectra with other reference spectra.

Input file path spectra_path = ‘’

``````**dict_keys(['__header__', '__version__', '__globals__', 'AB', 'wh'])
Shape of 'AB': (3301, 117)
Shape of 'wh': (1, 2)**
``````

wavenumber_path = ‘’

``````**dict_keys(['__header__', '__version__', '__globals__', 'wavenumber'])
Shape of 'wavenumber': (1, 3301)**
``````

refnames_path = ‘’

``````**dict_keys(['__header__', '__version__', '__globals__', 'refnames'])
Shape of 'refnames': (117,)**
``````

input_path = ‘’

``````dict_keys(['__header__', '__version__', '__globals__', 'ab', 'wn'])
Shape of 'ab': (467, 65537)
Shape of 'wn': (1, 467)
``````
``````import scipy.io as sio
import pandas as pd
import itertools
from sklearn.linear_model import LinearRegression
from scipy.interpolate import interp1d
import numpy as np

def create_data_frame(spectra_file, refnames_file, wavenumber_file):
spectra_mat = sio.loadmat(spectra_file)
data = spectra_mat['AB'].T
data = data[:, ::-1]
wavenumber_mat = sio.loadmat(wavenumber_file)
wavenumber = wavenumber_mat['wavenumber'][0]
wavenumber = wavenumber[::-1]
refnames_mat = sio.loadmat(refnames_file)
names = refnames_mat['refnames'].flatten()
wav = wavenumber.tolist()
names = np.char.array(names).tolist()
final_data_frame = pd.DataFrame(data, columns=wav)
final_data_frame['ID'] = names  # File ID or Reference names
return final_data_frame

final_data_frame = create_data_frame(spectra_path, refnames_path, wavenumber_path)
print(final_data_frame.shape)
final_data_frame

ref_spectra_df = create_data_frame(spectra_path, refnames_path, wavenumber_path)
wavenumber_columns = [col for col in ref_spectra_df.columns if col != 'ID' and 900 <= float(col) <= 1800]
ref_spectra = ref_spectra_df[wavenumber_columns]
print(ref_spectra.shape)

dat2 = sio.loadmat(refnames_path)
names = dat2['refnames'].flatten()
names = [str(name[0]) for name in names]
names = list(itertools.chain.from_iterable(names) if any(isinstance(i, list) for i in names) else names)
``````
``````X = df_input

ref_wavenumbers = np.linspace(1800, 900, ref_spectra.shape[1])
input_wavenumbers = np.linspace(1800, 900, X.shape[1])

interpolator = interp1d(ref_wavenumbers, ref_spectra, axis=1, kind='linear')
ref_spectra_interpolated = interpolator(input_wavenumbers)

regression_model = LinearRegression(positive=True)
regression_model.fit(ref_spectra_interpolated.T, X.T)
coefs = regression_model.coef_
coefs_df = pd.DataFrame(coefs.T, index=X.index)
coefs_df.columns = names
coefs_df = coefs_df.reindex(coefs_df.mean().sort_values(ascending=False).index, axis=1)
norm = coefs_df.mean()
coefs_df = coefs_df / norm
coefs_df = coefs_df.dropna(axis=1, how='any')
coefs_df.to_csv('CLS fitting results - Serum.csv')
``````

The above code ended up while extracting coefs.

X=df_input which is input dataframe. the size is (256,256,467) which i reorganized as (256*256,467) which makes it (65536,467), where 65536 corresponds to each pixel.

so i further modified the regression code to iterate over each pixel.

``````regression_model = LinearRegression(positive=True)
coefs = np.zeros((X.shape[0], ref_spectra_interpolated.shape[1]))
for i in range(X.shape[0]):
if isinstance(X, pd.DataFrame):
y = X.iloc[i, :].values
else:
y = X[i, :]
regression_model.fit(ref_spectra_interpolated, y.reshape(-1, 1))
coefs[i, :] = regression_model.coef_

ValueError                                Traceback (most recent call last)
<ipython-input-35-fdd8039b578c> in <cell line: 6>()
9     else:
10         y = X[i, :]
---> 11     regression_model.fit(ref_spectra_interpolated, y.reshape(-1, 1))
12     coefs[i, :] = regression_model.coef_

3 frames
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
395     uniques = np.unique(lengths)
396     if len(uniques) > 1:
--> 397         raise ValueError(
398             "Found input variables with inconsistent numbers of samples: %r"
399             % [int(l) for l in lengths]

ValueError: Found input variables with inconsistent numbers of samples: [117, 467]
``````

Additional Information

``````Final DataFrame shape: (117, 3302)
4000      3999      3998      3997      3996      3995      3994      3993  \
0   0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.010017
1   0.0  0.076237  0.075786  0.092208  0.083860  0.089052  0.084329  0.093639
2   0.0  0.006262  0.009665  0.011625  0.016771  0.020251  0.021349  0.016950
3   0.0  0.073635  0.108422  0.166655  0.219240  0.290049  0.360189  0.441440
4   0.0  0.293801  0.386299  0.399522  0.278570  0.125340  0.022092  0.000000

3992      3991  ...       708       707       706       705       704  \
0  0.006314  0.031430  ...  0.000000  0.000000  0.000000  0.000000  0.000000
1  0.101848  0.128834  ...  0.000000  0.000000  0.000000  0.000000  0.000000
2  0.010785  0.004299  ...  0.006163  0.004709  0.003217  0.001170  0.000319
3  0.519642  0.607271  ...  0.000000  0.000000  0.000000  0.000000  0.000000
4  0.001895  0.040333  ...  0.019329  0.022445  0.023159  0.017931  0.006998

703       702       701  700                            ID
0  0.000000  0.000000  0.000000  0.0  ATP
1  0.000000  0.000000  0.000000  0.0  Acid-phosphatase
2  0.000000  0.000000  0.000185  0.0  Actin
3  0.000000  0.000000  0.000000  0.0  Adenine
4  0.003938  0.004616  0.022361  0.0  Ala-Phe

[5 rows x 3302 columns]
Reference Spectra shape: (117, 901)
1800      1799      1798      1797      1796      1795      1794  \
0  0.398656  0.400172  0.401489  0.404073  0.408325  0.414702  0.421624
1  0.690701  0.688172  0.684635  0.681072  0.677156  0.673283  0.669346
2  0.191147  0.197091  0.203515  0.210285  0.216416  0.223856  0.233328
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
4  0.090553  0.101870  0.105062  0.107536  0.110965  0.119128  0.127208

1793      1792      1791  ...      909       908       907       906   \
0  0.429084  0.437233  0.447926  ...  0.003007  0.003197  0.003463  0.003506
1  0.666719  0.665631  0.666432  ...  0.002496  0.002457  0.002665  0.002789
2  0.245261  0.257144  0.268808  ...  0.000177  0.000900  0.002059  0.002893
3  0.000000  0.000000  0.000000  ...  0.055016  0.057460  0.060365  0.061447
4  0.139671  0.156604  0.181457  ...  0.000000  0.000000  0.000000  0.000000

905       904       903       902       901       900
0  0.003749  0.004070  0.004753  0.005364  0.006069  0.006608
1  0.003047  0.003060  0.003115  0.002932  0.002834  0.002494
2  0.003389  0.003392  0.003278  0.002821  0.002229  0.001372
3  0.060328  0.056912  0.053430  0.050287  0.048376  0.046766
4  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000

[5 rows x 901 columns]
Names length: 117
First few names: ['A', 'A', 'A', 'A', 'A']
Interpolated Reference Spectra shape: (117, 467)
Coefficients shape: (65537, 117)
``````

`plt.plot(input_wavenumbers, ref_spectra_interpolated[:, i], label=name)`

Any suggestions or anyone handled similar data, appreciate your time and help.

I modified the above code and got an update here

``````import numpy as np
import pandas as pd
from scipy import io

input_mat = io.loadmat('[path to input mat file]')
input_spectra = input_mat['ab'].T
input_wavenumbers = np.linspace(1800, 900, input_spectra.shape[1])

ref_spectra_mat = io.loadmat('path to reference spectra mat file')
reference_spectra = ref_spectra_mat['AB'].T
wavenumber_mat = io.loadmat('path to wavenummber mat file')
reference_wavenumbers = wavenumber_mat['wavenumber'][0]
reference_df = pd.DataFrame(reference_spectra, columns=reference_wavenumbers)
input_df = pd.DataFrame(input_spectra, columns=input_wavenumbers)
matched_columns = {iw: reference_wavenumbers[np.argmin(np.abs(reference_wavenumbers - iw))] for iw in input_wavenumbers}
matched_reference_df = reference_df[list(matched_columns.values())]
matched_input_df = input_df.rename(columns=matched_columns)
print(f"Matched wavenumbers (input to reference): {matched_columns}")
matched_reference_df = matched_reference_df[matched_input_df.columns]
print(matched_reference_df)

X = matched_reference_df.to_numpy()
Y = matched_input_df.to_numpy()
coefficients = np.zeros((Y.shape[0], X.shape[0]))
for i in range(Y.shape[0]):
coefs, _, _, _ = np.linalg.lstsq(X.T, Y[i, :], rcond=None)
coefficients[i, :] = coefs
print("Coefficients shape:", coefficients.shape)

reconstructed_Y = np.dot(coefficients, X)
residuals = Y - reconstructed_Y
mse = np.mean(residuals**2, axis=1)
print("Overall Mean Squared Error:", np.mean(mse))

# log-likelihood
n = Y.shape[1]
log_likelihood = -n/2 * np.log(2 * np.pi * mse) - np.sum(residuals**2, axis=1) / (2 * mse)
print("Log Likelihood:", np.mean(log_likelihood))

log_likelihood = 1978.526696385699
mse = 0.004512451234982761

n = Y.shape[0]
k = X.shape[0]

print("Number of observations (n):", n)
print("Number of parameters (k):", k)
print("Log likelihood:", log_likelihood)

# Calculate BIC
BIC = k * np.log(n) - 2 * log_likelihood

print("BIC:", BIC)
``````

However, I am not sure about the formula for Logliklihood.

``````from statistics import variance
import math

reconstructed_Y = np.dot(coefficients, X)
residuals = Y - reconstructed_Y
se = residuals**2

n = Y.shape[1]
log_likelihood = []

for i in range(se.shape[0]):
var = variance(residuals[i, :])
# Calculate the cumulative product and then take the log
ll = np.log(np.cumprod((1 / np.sqrt(2 * np.pi * var)) * np.exp(-se[i, :] / (2 * var))))
log_likelihood.append(ll)

print("Log Likelihood:", log_likelihood)

``````

However, the result ends up with error such as undervalue or inf.
Any suggestions.