I’m currently working on a project called Project WATT.WIZARD. This Python script reads energy consumption data from a CSV file, cleans it by handling missing values and outliers, and then analyzes the data to generate a report on usage patterns. It builds a linear regression model to predict future energy consumption based on the time of day and day of the week. The script then forecasts consumption for a user-specified number of future hours and plots both historical and predicted data on a graph, making it easy to visualize trends and predictions.
Here are the main topics covered by the code:
Data Import and Parsing: Reading and preparing time series data from a CSV file. Data Cleaning: Handling missing values and outliers in the energy consumption data. Data Analysis: Generating reports on energy consumption patterns (e.g., highest and lowest usage). Model Building: Creating a linear regression model to predict future energy consumption. Future Consumption Prediction: Forecasting energy usage for specified future hours. Data Visualization: Plotting historical and predicted energy consumption on a graph. Main Execution Flow: Orchestrating the overall process, including user input, data processing, analysis, prediction, and visualization.
I used linear regression as my machine learning model in this case, but the output graph showed negative values and an odd-looking curve for the future predictions.
What else should I use as my machine learning model?
Please let me know why this happens and how I can fix it.
Here is my code.
import sys
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def collect_EnergyData(file_path):
    """Load the energy CSV at *file_path*, indexed by its ``timestamps`` column.

    The ``timestamps`` column is parsed as dates while reading and then
    becomes the index of the returned DataFrame.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. A message
        describing the failure is printed in that case.
    """
    try:
        frame = pd.read_csv(file_path, parse_dates=['timestamps']).set_index('timestamps')
    except FileNotFoundError:
        print(f"System Error! The system was unable to locate the file path: {file_path}")
    except pd.errors.EmptyDataError:
        print(f"File Error! This file ({file_path}) is empty!")
    except Exception as err:
        # Any other failure (bad format, missing column, ...) is reported, not raised.
        print(f"Error reading the CSV File: {err}")
    else:
        return frame
    return None
def clean_data(data, hard_cap=10000, iqr_multiplier=1.5):
    """Clean the ``consumption`` column: no negatives, no gaps, no high outliers.

    Steps, applied to ``data['consumption']``:

    1. Negative readings are clipped to 0.
    2. Readings above ``hard_cap`` are treated as missing.
    3. Readings above ``Q3 + iqr_multiplier * IQR`` (quartiles taken over the
       remaining valid readings) are treated as missing.
    4. Every missing reading is filled with the mean of the surviving valid
       readings.

    The fill value is computed *after* the outliers are removed. Computing it
    first lets the outliers inflate the mean, so a replaced reading could
    itself still sit above the IQR threshold.

    Args:
        data: DataFrame expected to contain a ``consumption`` column. It is
            modified in place.
        hard_cap: Absolute ceiling above which a reading is discarded.
        iqr_multiplier: Width of the upper IQR fence.

    Returns:
        The same DataFrame object. If the ``consumption`` column is absent a
        message is printed and the frame is returned unchanged.
    """
    if 'consumption' not in data.columns:
        print("Column 'consumption' not found in the data. ")
        return data

    consumption = data['consumption'].clip(lower=0)

    # mask() turns the rejected readings into NaN (upcasting integer columns to
    # float), so they no longer influence the quartiles or the fill value.
    consumption = consumption.mask(consumption > hard_cap)

    q1 = consumption.quantile(0.25)
    q3 = consumption.quantile(0.75)
    upper_threshold = q3 + iqr_multiplier * (q3 - q1)
    consumption = consumption.mask(consumption > upper_threshold)

    # mean() skips NaN, so this is the mean of the accepted readings only.
    data['consumption'] = consumption.fillna(consumption.mean())
    return data
def analyze_consumption(data):
    """Print a usage report built from hourly totals of ``data['consumption']``.

    The readings are summed into hourly buckets. The report prints the
    bucketed series, the bucket with the highest total, the bucket with the
    lowest total, the grand total, and the highest bucket's share of that
    total. Note that the printed headings say "Daily"/"day" although the
    aggregation is hourly.

    Args:
        data: DataFrame with a datetime index and a ``consumption`` column.

    Returns:
        None. The report is written to stdout.
    """
    def spacer():
        print("\n")

    per_hour = data['consumption'].resample('h').sum()

    print("\n<< Daily Consumption Report >>")
    spacer()
    print(per_hour)
    spacer()

    peak_value, peak_time = per_hour.max(), per_hour.idxmax()
    print(f"1. Highest Usage is on: {peak_value} kWh on {peak_time}")
    spacer()

    low_value, low_time = per_hour.min(), per_hour.idxmin()
    print(f"2. Lowest Usage is on: {low_value} kWh on {low_time}")
    spacer()

    grand_total = per_hour.sum()
    print(f"3. Total Consumption of the day: {grand_total} kWh")
    spacer()

    peak_share = (peak_value / grand_total) * 100
    print(f"4. Maximum Day Consumption as a percentage of Total: {peak_share:.3f}%")
def predict_demand(data):
    """Fit a random-forest regressor on calendar features and print hold-out metrics.

    Four columns derived from the datetime index are added to ``data`` in
    place (``hour``, ``days_of_week``, ``month``, ``week_of_year``). They are
    used as the features for predicting ``consumption``. 30% of the rows are
    held out for evaluation; MSE, MAE and R^2 on that hold-out are printed.

    NOTE(review): ``train_test_split`` shuffles by default, so the hold-out
    rows are interleaved in time with the training rows.

    Args:
        data: DataFrame with a datetime index and a ``consumption`` column.

    Returns:
        The fitted ``RandomForestRegressor`` (trained on the 70% split).
    """
    stamps = data.index
    data['hour'] = stamps.hour
    data['days_of_week'] = stamps.dayofweek
    data['month'] = stamps.month
    data['week_of_year'] = stamps.isocalendar().week

    feature_columns = ['hour', 'days_of_week', 'month', 'week_of_year']
    train_features, test_features, train_target, test_target = train_test_split(
        data[feature_columns],
        data['consumption'],
        test_size=0.3,
        random_state=130,
    )

    forest = RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=130,
    )
    forest.fit(train_features, train_target)
    predicted = forest.predict(test_features)

    # All three metrics are evaluated before anything is printed.
    metric_lines = (
        f"5. Mean Squared Error Value is: {mean_squared_error(test_target, predicted)}",
        f"6. Mean Absolute Error Value is: {mean_absolute_error(test_target, predicted)}",
        f"7. R Squared Value is: {r2_score(test_target, predicted)}",
    )
    for line in metric_lines:
        print("\n")
        print(line)

    return forest
def _prompt_future_hours():
    """Ask on stdin until the user enters a positive whole number of hours."""
    while True:
        try:
            hours = int(input("Please input the number of hours you wish to forecast. The system is ready to calculate your future energy values? "))
            if hours <= 0:
                raise ValueError("Number of hours must be a positive value.")
            return hours
        except ValueError as expt:
            print(f"Invalid input: {expt}. Please enter a positive integer. ")


def predict_future_consumption(model, cleaned_data, future_hours=None):
    """Forecast hourly consumption for the hours following the latest reading.

    Args:
        model: Fitted estimator exposing ``predict``; it receives a DataFrame
            with the columns ``hour``, ``days_of_week``, ``month`` and
            ``week_of_year`` (in that order).
        cleaned_data: DataFrame with a datetime index; only the index is read.
        future_hours: Number of hours to forecast. When ``None`` (the
            default) the user is prompted interactively until a positive
            integer is entered.

    Returns:
        A ``(future_consumption, future_data)`` tuple: the predictions with
        negative values clipped to 0, and the feature DataFrame indexed by
        the forecast timestamps.

    Raises:
        ValueError: If ``future_hours`` is supplied and is not a positive
            integer.
    """
    # max() rather than index[-1]: the forecast must start after the latest
    # reading even when the rows are not in chronological order.
    last_timestamp = cleaned_data.index.max()
    print("\n")

    if future_hours is None:
        future_hours = _prompt_future_hours()
    else:
        future_hours = int(future_hours)
        if future_hours <= 0:
            raise ValueError("Number of hours must be a positive value.")

    one_hour = pd.Timedelta(hours=1)
    future_index = pd.date_range(start=last_timestamp + one_hour, periods=future_hours, freq=one_hour)

    # Same feature names and column order as used when the model was trained.
    future_data = pd.DataFrame(
        {
            'hour': np.asarray(future_index.hour),
            'days_of_week': np.asarray(future_index.dayofweek),
            'month': np.asarray(future_index.month),
            'week_of_year': future_index.isocalendar().week.astype('int64').to_numpy(),
        },
        index=future_index,
    )

    # Consumption cannot be negative, so floor the predictions at zero.
    future_consumption = np.clip(model.predict(future_data), 0, None)

    print("\n")
    print("8. Predicted Consumption for next hours: ")
    print()
    for timestamp, consumption in zip(future_data.index, future_consumption):
        print(f" {timestamp}: {consumption:.2f} kWh")
    return future_consumption, future_data
def suggest_methods_to_reduce(high_usage_periods):
    """Print energy-saving tips for each timestamp in ``high_usage_periods.index``.

    A header and one general tip are printed first. Each timestamp is then
    matched to a part of the day by its hour (0-5, 6-10, 11-15, 16-18,
    otherwise night) and that period's headline and tips are printed.

    Args:
        high_usage_periods: Object whose ``index`` yields timestamps that
            expose an ``hour`` attribute.

    Returns:
        None. The tips are written to stdout.
    """
    # (exclusive upper hour bound, headline template, tips); the last entry is
    # also the fallback for any hour not matched by an earlier bound.
    advice_table = (
        (
            6,
            " {time}: Early in the morning is when you use the most energy. Therefore, you can use the following reduction techniques! ",
            (
                " -> Schedule non-essential process for off-peak hours.",
                " -> Try to use energy-efficient lighting with motion sensors.",
                " -> Optimize HVAC system for minimal nighttime operations.",
            ),
        ),
        (
            11,
            " {time}: You consume the most energy in the Morning.Therefore, you can use the following reduction techniques!",
            (
                " -> Implement staggered start times for equipment.",
                " -> Utilize natural lighting where possible.",
                " -> Encourage energy-conscious behavior among early shift workers.",
            ),
        ),
        (
            16,
            " {time}: You consume the most energy in the afternoon.Therefore, you can use the following reduction techniques!",
            (
                " -> Conduct regular maintenance to ensure equipment efficiency.",
                " -> Use smart power strips to reduce phantom energy consumption.",
                " -> Optimize production schedules to avoid simultaneous operation of high-energy equipment.",
                " -> Utilize natural lighting where possible.",
            ),
        ),
        (
            19,
            " {time}: You consume the most energy in the Evening.Therefore, you can use the following reduction techniques!",
            (
                " -> Implement demand response strategies during grid peak times.",
                " -> Use energy storage systems to offset high demand.",
                " -> Encourage telecommuting or flexible hours to reduce facility energy use.",
            ),
        ),
        (
            24,
            " {time}: You consume the most energy in the Night.Therefore, you can use the following reduction techniques!",
            (
                " -> Automate shutdown procedures for non-essential equipment.",
                " -> Use timer controls for exterior lighting.",
                " -> Conduct energy-intensive processes during off-peak hours if possible.",
            ),
        ),
    )

    print("\n<< Here are some tips to reduce the Higher Values : Suggested by WATT.WIZARD >>")
    print("\nGeneral Tip: When using minimum-usage energy, try to store any extra and use it during peak hours, and make sure that co-workers don't misuse energy.")
    print()

    for moment in high_usage_periods.index:
        hour = moment.hour
        _, headline, tips = next(
            (entry for entry in advice_table if hour < entry[0]),
            advice_table[-1],
        )
        print(headline.format(time=moment))
        for tip in tips:
            print(tip)
        print("\n")
def plot_consumption(data, future_data, future_consumption):
    """Show historical and predicted consumption on one time-series chart.

    Args:
        data: DataFrame whose index and ``consumption`` column form the
            historical line.
        future_data: DataFrame whose index supplies the x-values of the
            predicted line.
        future_consumption: Predicted values plotted against
            ``future_data.index`` as a red dash-dot line.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _, axes = plt.subplots(figsize=(16, 8))

    axes.plot(data.index, data['consumption'], label='Historical Data')
    axes.plot(future_data.index, future_consumption, 'r-.', label='Predicted Data')

    axes.set_title('Energy Consumption Graph: Based on Given and the Predicted Values\n Dinalofcl - WATT.WIZARD')
    axes.set_xlabel('Time')
    axes.set_ylabel('Consumption in kWh')
    axes.legend()
    axes.grid(True)

    plt.show()
def main():
    """Run the interactive WATT.WIZARD workflow from CSV path to final chart.

    Prompts for a CSV path, loads and cleans the data, prints the usage
    report, trains the model, forecasts the requested number of hours, prints
    reduction tips for the forecast hours above the 85th percentile of
    predicted consumption, and finally shows the chart.

    The run stops early, after printing a failure message, when the file
    cannot be loaded, when it has no ``consumption`` column, or when it
    contains no rows. The later steps would otherwise raise on such input.
    """
    print("\n")
    file_path = input("Hello User! Please enter the full path to the CSV file: ")

    raw_data = collect_EnergyData(file_path)
    if raw_data is None:
        print("Failed to collect and process data.")
        return

    cleaned_data = clean_data(raw_data)
    # clean_data returns the frame unchanged when 'consumption' is missing, and
    # a header-only CSV yields an empty frame; neither can be analysed.
    if 'consumption' not in cleaned_data.columns or cleaned_data.empty:
        print("Failed to collect and process data.")
        return

    analyze_consumption(cleaned_data)
    model = predict_demand(cleaned_data)
    future_consumption, future_data = predict_future_consumption(model, cleaned_data)

    # Forecast hours strictly above the 85th percentile count as high usage.
    high_usage_threshold = np.percentile(future_consumption, 85)
    high_usage_periods = future_data[future_consumption > high_usage_threshold]
    suggest_methods_to_reduce(high_usage_periods)

    print("\n")
    print("We are creating the graphical user interface. Hold On....", end='', flush=True)
    time.sleep(3)
    # Overwrite the "Hold On" line with spaces and return the cursor to column 0.
    sys.stdout.write('\r' + ' ' * 60 + '\r')
    sys.stdout.flush()

    plot_consumption(cleaned_data, future_data, future_consumption)
# Start the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()