Why does this graph still show two outliers?

bennylenny · May 14, 2026, 1:25pm

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

List of constants

Sr_mol = 1e-6 # Moles of Strontium
Avogadro = 6.02e23 # Avogadro’s number - no. of atoms in one mole of a substance
Sr_nuclei = Sr_mol * Avogadro # Number of Strontium nuclei in one mole
est_lambda_Rb = 0.0005 # Estimate for Rb decay constant from previous study; in s^-1
est_lambda_Sr = 0.005 # Estimate for Sr decay constant from previous study; in s^-1

Files

nuclear_data_1 = Path(“Nuclear_data_1.csv”)
nuclear_data_2 = Path(“Nuclear_data_2.csv”)
nuclear_plot_filename = “nuclear_decay_fit.png”

Test if the files are readable

try:
with open(nuclear_data_1) as f1, open(nuclear_data_2) as f2:
line1 = f1.readline()
line2 = f2.readline()
except FileNotFoundError:
print(“Error: One or both of the files was not found. Please ensure ‘Nuclear_data_1.csv’ and ‘Nuclear_data_2.csv’ are in the right folder.”)

def activity_rb(t_sec: np.ndarray, lambda_Rb: float, lambda_Sr: float) → np.ndarray:
“”"
Activity of Rb-79 at time t is calculated; this is modelled on the decay of Sr-79.

t_sec = time in seconds
lambda_Rb = Decay constant for Rubidium (s^-1).
lambda_Sr = Decay constant for Strontium (s^-1).
"""
return Sr_nuclei * lambda_Rb * lambda_Sr / (lambda_Sr - lambda_Rb) * (np.exp(-lambda_Rb * t_sec) - np.exp(-lambda_Sr * t_sec))

DATA READING, CLEANING AND OUTLIER REMOVAL

“”"
The following code checks the file, ensures it has three columns, and removes
any columns marked with ‘error’ or implausible data, e.g. a time value of minus
one.

It also removes outliers using the standard deviation method, where values of
more than three standard deviations of the mean are “masked” (removed).

The abbreviation ‘df’ which appears throughout the code refers to the data frame.
The so defined ‘uncertainty’ refers to the uncertainty in the activity.
“”"
def read_and_clean_file(file_path: Path) → pd.DataFrame:
try:
df = pd.read_csv(file_path, engine=“python”, comment=“%”)
except pd.errors.EmptyDataError:
print(f"Error: The file ‘{file_path}’ is empty.“)
return pd.DataFrame()
except Exception as exc:
print(f"There was an error reading’{file_path}': {exc}”)
return pd.DataFrame()

if df.shape[1] < 3:
    print(f"Error: The file '{file_path}' does not have the three columns required (time, activity, and uncertainty).")
    return pd.DataFrame()

df = df.iloc[:, :3]
df.columns = ["time", "activity", "uncertainty"]
df = df.apply(pd.to_numeric, errors="coerce")
df.dropna(inplace=True)
df = df[(df["time"] >= 0) & (df["activity"] > 0) & (df["uncertainty"] > 0)]

return df

def remove_outliers_std(df: pd.DataFrame, threshold: float = 3.0) → pd.DataFrame:

if df.empty:
    return df

mean_vals = df[["activity", "time"]].mean()
std_vals = df[["activity", "time"]].std()

mask = ((np.abs(df["activity"] - mean_vals["activity"]) <= 3 * std_vals["activity"]) &
    (np.abs(df["time"] - mean_vals["time"]) <= 3 * std_vals["time"]))
return df[mask]

THE FIT

“”"
This part of the code converts the data to the right units, makes a calculation
for chi squared and the reduced chi squared, as well as a prediction for the model.

Time must be in hours.
Activity and uncertainty in activity must be in TBq.

‘Optimal_activity’ refers to the best fit values for the model.
‘Standard error’ refers to the standard deviation (uncertainty) estimates by the
model
‘p0’ refers to the initial parameters, the values for the decay constants determined
in a previous study
“”"
def fit_parameters(times_h: np.ndarray, activities_tbq: np.ndarray, uncertainties_tbq: np.ndarray):

time_sec = times_h * 3600.0
activity_bq = activities_tbq * 1e12
activity_uncertainty_bq = uncertainties_tbq * 1e12 

optimal_activity, covariance = curve_fit(activity_rb, time_sec, activity_bq, sigma=activity_uncertainty_bq, absolute_sigma=True, p0=[5e-4, 5e-3], maxfev=10000)
standard_error = np.sqrt(np.diag(covariance))
model = activity_rb(time_sec, *optimal_activity)
chi_squared = np.sum(((activity_bq - model) / activity_uncertainty_bq) ** 2)
chi_squared_red = chi_squared / (len(activity_bq) - 2)

return optimal_activity, standard_error, covariance, chi_squared, chi_squared_red

def half_life(lambda_value: float) → float:
“”"
Calculates the half-life from a given decay constant.

This is equivalent to the natural log of two divided by the decay constant
(probability per unit of time that a radioactive nucleus will decay).
"""
return np.log(2.0) / lambda_value

def half_life_uncertainty(lambda_value: float, sigma_lambda: float) → float:
“”"
Uncertainty calculation in the half-life given the decay constant and its uncertainty.
“”"
return (np.log(2.0) / lambda_value**2) * sigma_lambda

def predict_with_uncertainty(optimal_activity: np.ndarray, covariance: np.ndarray, t_min: float):
“”"
This section uses partial derivatives and the calculated best fit to make a
prediction for the activity.

It also uses the root-sum-square method to calculate and combine the uncertainties.
"""

t_sec = t_min * 60.0

lambda_Rb, lambda_Sr = optimal_activity

increment_Rb = lambda_Rb * 1e-6
increment_Sr = lambda_Sr * 1e-6

df_dRb = (activity_rb(np.array([t_sec]), lambda_Rb + increment_Rb, lambda_Sr)[0] - activity_rb(np.array([t_sec]), lambda_Rb - increment_Rb, lambda_Sr)[0]) / (2.0 * increment_Rb)
df_dSr = (activity_rb(np.array([t_sec]), lambda_Rb, lambda_Sr + increment_Sr)[0] - activity_rb(np.array([t_sec]), lambda_Rb, lambda_Sr - increment_Sr)[0]) / (2.0 * increment_Sr)

activity = activity_rb(np.array([t_sec]), lambda_Rb, lambda_Sr)[0]
uncertainty = np.sqrt((df_dRb * np.sqrt(covariance[0, 0]))**2 + (df_dSr * np.sqrt(covariance[1, 1]))**2)

return activity, uncertainty

PLOTTING

def make_publication_plot(times_h: np.ndarray, activities_tbq: np.ndarray, uncertainties_tbq: np.ndarray, optimal_activity, chi_squared_red: float):
“”"
Creates a plot for publication of the nuclear decay data.

The data is plotted with uncertainties and the non-linear model of the nuclear
decay is highlighted in red.

"""
plt.style.use("Solarize_Light2")
fig, ax = plt.subplots(figsize=(10, 7), dpi=400)

ax.errorbar(times_h, activities_tbq, yerr=uncertainties_tbq, fmt="x", ms=4, capsize=3,
            color="darkblue", ecolor="black", elinewidth=1, label="Data points")

t_plot = np.linspace(times_h.min(), times_h.max(), 600)
model_tbq = activity_rb(t_plot * 3600.0, *optimal_activity) / 1e12

ax.plot(t_plot, model_tbq, color="red", lw=2.2, label="Non-linear model of best-fit")

ax.set_xlabel("Time (hours)", fontsize=12)
ax.set_ylabel("Activity of 79Rb (TBq)", fontsize=12)
ax.set_title("Nuclear Decay of 79Sr into 79Rb", fontsize=14)

lambda_rb, lambda_sr = optimal_activity

text = f"λ_Rb = {lambda_rb:.3e} s^-1\nλ_Sr = {lambda_sr:.3e} s^-1\nχ^2_red = {chi_squared_red:.2f}"

ax.text(0.95, 0.95, text, transform=ax.transAxes, ha="right", va="top",
        fontsize=11, bbox=dict(facecolor="white", alpha=0.95))

ax.legend()
fig.tight_layout()

fig.savefig(nuclear_plot_filename)

plt.show()
plt.close(fig)

MAIN PART OF CODE

“”"
The main section of the code carries out an analysis of the nuclear decay data.

It summarises the above sections of checking, reading and cleaning the inputted
data files, converts everything to numpy, fits the theoretical model to the
experimental data, calculates the half-lives and activity including their
uncertainties.

‘Diff_unc’ is an abbreviation for the differential uncertainty.
“”"
def main() → None:

file1 = read_and_clean_file(nuclear_data_1)
file2 = read_and_clean_file(nuclear_data_2)
if file1.empty or file2.empty:
    print("Error: One or both of data files cannot be processed. Please check them before continuing.")
    return

combined_df = pd.concat([file1, file2], ignore_index=True)
combined_df.sort_values(by="time", inplace=True)

cleaned_df = remove_outliers_std(combined_df, threshold=3.0)
cleaned_df.reset_index(drop=True, inplace=True)

times_h = cleaned_df["time"].to_numpy()
activities_tbq = cleaned_df["activity"].to_numpy()
uncertainties_tbq = cleaned_df["uncertainty"].to_numpy()

optimal_activity, standard_error, covariance, chi_squared, chi_squared_red = fit_parameters(times_h, activities_tbq, uncertainties_tbq)
lambda_Rb, lambda_Sr = optimal_activity  
sigma_Rb, sigma_Sr = standard_error 

lambda_Rb_min = lambda_Rb * 60.0
lambda_Sr_min = lambda_Sr * 60.0
sigma_Rb_min = sigma_Rb * 60.0
sigma_Sr_min = sigma_Sr * 60.0

t_half_Rb_sec = half_life(lambda_Rb)
t_half_Sr_sec = half_life(lambda_Sr)
diff_unc_half_Rb_sec = half_life_uncertainty(lambda_Rb, sigma_Rb)
diff_unc_half_Sr_sec = half_life_uncertainty(lambda_Sr, sigma_Sr)

t_half_Rb_min = t_half_Rb_sec / 60.0
t_half_Sr_min = t_half_Sr_sec / 60.0
diff_unc_half_Rb_min = diff_unc_half_Rb_sec / 60.0
diff_unc_half_Sr_min = diff_unc_half_Sr_sec / 60.0

pred_bq, pred_sigma_bq = predict_with_uncertainty(optimal_activity, covariance, 90.0)
pred_tbq = pred_bq / 1e12
pred_sigma_tbq = pred_sigma_bq / 1e12

# SUMMARY
"""
A summary of all the results obtained by the code is printed here, which is
presented within a table.

It also classifies their uncertainties to determine whether the results are within
an acceptable range of the actual value.
"""

print("\n======================= SUMMARY TABLE ========================")
print(f"{'Quantity':<30} {'Value':>15} {'Uncertainty':>15}")
print("--------------------------------------------------------------")
print(f"{'λ_Rb (min^-1)':<30} {lambda_Rb_min:>15.3g} {sigma_Rb_min:>15.3g}")
print(f"{'λ_Sr (min^-1)':<30} {lambda_Sr_min:>15.3g} {sigma_Sr_min:>15.3g}")
print(f"{'t_1/2 Rb (min)':<30} {t_half_Rb_min:>15.3g} {diff_unc_half_Rb_min:>15.3g}")
print(f"{'t_1/2 Sr (min)':<30} {t_half_Sr_min:>15.3g} {diff_unc_half_Sr_min:>15.3g}")
print(f"{'Activity at 90 min (TBq)':<30} {pred_tbq:>15.3g} {pred_sigma_tbq:>15.3g}")
print(f"{'χ^2_red':<30} {chi_squared_red:>15.2f} {'N/A':>15}")
print("==============================================================\n")

print("\n======================================= DATA ANALYSIS =========================================")
print(f"{'Quantity':<30} {'Relative Uncertainty':>20} {'Status':>12} {'Explanation':>30}")
print("-----------------------------------------------------------------------------------------------")

def classify_uncertainty(value: float, sigma: float) -> tuple[float, str, str]:
    relative_uncertainty = abs(sigma / value)
    if relative_uncertainty < 0.10:
        return relative_uncertainty, "GOOD", "uncertainty < 10%"
    elif relative_uncertainty <= 0.20:
        return relative_uncertainty, "ACCEPTABLE", "uncertainty between 10–20%"
    else:
        return relative_uncertainty, "POOR", "uncertainty > 20%"

rel_Rb_lambda, status_Rb_lambda, expl_Rb_lambda = classify_uncertainty(lambda_Rb_min, sigma_Rb_min)
rel_Sr_lambda, status_Sr_lambda, expl_Sr_lambda = classify_uncertainty(lambda_Sr_min, sigma_Sr_min)
rel_Rb_half, status_Rb_half, expl_Rb_half = classify_uncertainty(t_half_Rb_min, diff_unc_half_Rb_min)
rel_Sr_half, status_Sr_half, expl_Sr_half = classify_uncertainty(t_half_Sr_min, diff_unc_half_Sr_min)
rel_pred_tbq, status_pred_tbq, expl_pred_tbq = classify_uncertainty(pred_tbq, pred_sigma_tbq)

print(f"{'λ_Rb (min^-1)':<30} {rel_Rb_lambda:>20.3f} {status_Rb_lambda:>12} {expl_Rb_lambda:>30}")
print(f"{'λ_Sr (min^-1)':<30} {rel_Sr_lambda:>20.3f} {status_Sr_lambda:>12} {expl_Sr_lambda:>30}")
print(f"{'t_1/2 Rb (min)':<30} {rel_Rb_half:>20.3f} {status_Rb_half:>12} {expl_Rb_half:>30}")
print(f"{'t_1/2 Sr (min)':<30} {rel_Sr_half:>20.3f} {status_Sr_half:>12} {expl_Sr_half:>30}")
print(f"{'Activity at 90 min (TBq)':<30} {rel_pred_tbq:>20.3f} {status_pred_tbq:>12} {expl_pred_tbq:>30}")
print(f"{'χ^2_red':<30} {'N/A':>20} {'N/A':>12} {'no uncertainty':>30}")
print("===============================================================================================\n")

make_publication_plot(times_h, activities_tbq, uncertainties_tbq, optimal_activity, chi_squared_red)
print(f"Plot saved as '{nuclear_plot_filename}'.")
print("Nuclear decay data analysis is complete.")

Ensure the code runs

if name == “main”:
main()

effigies · May 14, 2026, 1:47pm

A couple of quick resources for how to ask questions that could get the responses you’re looking for:

Even after fixing the formatting, this is a long script to expect others to digest. You should aim to have a MRE where you can demonstrate where there’s something behaving unexpectedly and ask targeted questions. Often the process of creating the MRE can help you answer the question.

This may also be a point where your professor/TA would be good to ask for feedback on this specific assignment. They are paid to teach you.