-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1,524 changed files
with
315 additions
and
414 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Theme colours — presumably Streamlit's .streamlit/config.toml (hex RGB); confirm location.
[theme]
primaryColor = '#1c83e1'                # accent colour for interactive widgets
backgroundColor = '#ffffff'             # main page background
secondaryBackgroundColor = '#f0f2f6'    # sidebar / widget background
textColor = '#000000'                   # body text colour
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
**Solar Farm Data Analysis for the 10 Academy KAIM Week 0 Challenge**
|
||
**Overview** | ||
This repository is the hub for a challenge centered around analyzing solar farm data from three countries: Benin, Sierra Leone, and Togo. The challenge is part of a selection process for a 12-week training program, where my skills in Data Engineering (DE), Financial Analytics (FA), and Machine Learning Engineering (MLE) will be put to the test. | ||
|
||
**Objective** | ||
The goal of this challenge is to explore and analyze the provided solar farm data to extract meaningful insights. Participants are expected to use data engineering techniques to clean and prepare the data, apply financial analytics to evaluate financial performance, and leverage machine learning to generate predictions and insights regarding the solar farms' operations. | ||
|
||
**Details of the Challenge** | ||
- **Data Source:** Datasets from solar farms in Benin, Sierra Leone, and Togo. | ||
- **Focus Areas:** | ||
- **Data Engineering (DE):** Involves cleaning, transforming, and preparing the data for analysis. | ||
- **Financial Analytics (FA):** Focuses on evaluating the financial metrics and performance of the solar farms. | ||
- **Machine Learning Engineering (MLE):** Involves developing models to predict and analyze outcomes. | ||
|
||
**Getting Started** | ||
1. **Clone the Repository:** | ||
```bash | ||
git clone https://github.com/AschalewMathewosDamtew/MoonLightEnergy.git | ||
``` | ||
2. **Navigate to the Project Directory:** | ||
```bash | ||
cd MoonLightEnergy | ||
``` | ||
3. **Set Up Your Environment:** Install the necessary dependencies listed in `requirements.txt`.
|
||
4. **Explore the Data:** The data needed for analysis is available in the `data` directory. Use these datasets for your exploration and analysis. | ||
|
||
**File Structure** | ||
- `data/` - Directory containing both raw and processed datasets. | ||
- `src/notebooks/` - Jupyter notebooks used for analysis and model development. | ||
- `scripts/` - Python scripts dedicated to data processing and analysis. | ||
- `reports/` - Folder for reports and visualizations generated during the analysis. | ||
- `README.md` - This file. |
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import streamlit as st | ||
from data_processing import load_data, clean_and_prepare_data | ||
from utils import data_quality_check | ||
from plots import plot_time_series, plot_area, create_scatter_plot, create_correlation_analysis | ||
import pandas as pd | ||
|
||
# Streamlit dashboard for exploring the solar-radiation datasets.
# Flow: pick a dataset -> optionally clean it -> choose which version to
# analyze -> render the selected plots.

# Load all three country datasets once per rerun.
datasets = load_data()

# Streamlit UI
st.title("Solar Radiation Data Analysis")

# Sidebar for dataset selection
dataset_name = st.sidebar.selectbox("Select Dataset", ("Benin", "Togo", "Sierra Leone"))
df = datasets[dataset_name]

# Display the dataset summary
st.write(f"### {dataset_name} Dataset Summary")
st.write(df.describe())

# Sidebar: optional cleaning step
clean_data = st.sidebar.checkbox("Want to Clean Data")

# Initialized up front so the name always exists, even when the user has
# not opted into cleaning (avoids a NameError on the analysis path below).
df_cleaned = None

if clean_data:
    # Data quality check before cleaning
    quality_results_before = data_quality_check(df)
    st.write("#### Data Quality Check Results (Before Cleaning)")
    st.write(pd.DataFrame(quality_results_before).T)

    # Clean data
    df_cleaned = clean_and_prepare_data(df)

    # Data quality check after cleaning
    quality_results_after = data_quality_check(df_cleaned)
    st.write("#### Data Quality Check Results (After Cleaning)")
    st.write(pd.DataFrame(quality_results_after).T)

    # Display cleaned data
    st.write(f"### {dataset_name} Cleaned Data")
    st.write(df_cleaned.head())

# Sidebar: choose which version of the data to analyze
st.sidebar.write("### Which to Analyze?")
analyze_uncleaned = st.sidebar.checkbox("Analyze Uncleaned Data")
analyze_cleaned = st.sidebar.checkbox("Analyze Cleaned Data")

# Determine which dataset to analyze; cleaned data is only available when
# the cleaning step actually ran.
df_to_analyze = None
data_label = ""

if analyze_uncleaned:
    df_to_analyze = df
    data_label = "Uncleaned Data"
elif analyze_cleaned and clean_data:
    df_to_analyze = df_cleaned
    data_label = "Cleaned Data"

if df_to_analyze is not None:
    st.sidebar.write(f"### Analysis Options for {data_label}")
    plot_area_selected = st.sidebar.checkbox("Area Plot")
    plot_time_series_selected = st.sidebar.checkbox("Time Series Plot")
    plot_scatter_selected = st.sidebar.checkbox("Scatter Plot")
    plot_correlation_selected = st.sidebar.checkbox("Correlation Analysis")

    # Perform each selected analysis.
    if plot_correlation_selected:
        create_correlation_analysis(df_to_analyze, dataset_name)

    if plot_area_selected:
        plot_area(df_to_analyze, dataset_name)

    if plot_scatter_selected:
        create_scatter_plot(df_to_analyze)

    if plot_time_series_selected:
        plot_time_series(df_to_analyze, dataset_name)
else:
    # NOTE(review): in the scraped original this `else` appeared to attach to
    # the innermost time-series `if`, which would show the warning whenever
    # that single plot was unticked. It belongs to the outer "is anything
    # selected" check, matching the message text.
    st.warning("Please select either 'Analyze Uncleaned Data' or 'Analyze Cleaned Data'.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import pandas as pd | ||
import numpy as np | ||
import os | ||
|
||
|
||
# Sensor columns validated by the cleaning pipeline: irradiance (GHI/DNI/DHI),
# module readings (ModA/ModB) and wind speed (WS/WSgust). All are expected to
# be non-negative — presumably physical sensor readings; confirm against data docs.
COLUMNS_TO_CHECK = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
||
def load_data():
    """Load the three country CSV datasets bundled with the repository.

    Returns:
        dict[str, pandas.DataFrame]: display name of each country mapped to
        its raw dataset. Paths are resolved relative to this file so the
        app works regardless of the current working directory.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    csv_files = {
        "Benin": '../data/benin-malanville.csv',
        "Togo": '../data/togo-dapaong_qc.csv',
        "Sierra Leone": '../data/sierraleone-bumbuna.csv',
    }
    return {
        country: pd.read_csv(os.path.join(here, rel_path))
        for country, rel_path in csv_files.items()
    }
||
def clean_data(df, columns=('GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust')):
    """Drop invalid and outlier rows from a sensor DataFrame.

    Two filters are applied in order:
      1. rows where any monitored column is negative are removed
         (negative irradiance / wind readings are sensor errors);
      2. rows more than 3 standard deviations from the post-filter column
         mean (z-score) in any monitored column are removed.

    Args:
        df: input DataFrame containing all columns named in ``columns``.
        columns: iterable of column names to validate. Defaults to the same
            sensor columns as the module-level ``COLUMNS_TO_CHECK``, so
            existing callers are unaffected; new callers may restrict or
            extend the checked set.

    Returns:
        A filtered copy-view of ``df``; the input is not modified.
    """
    cols = list(columns)
    df = df[(df[cols] >= 0).all(axis=1)]
    # z-scores are computed on the already-filtered data so the earlier
    # negative-value removals don't skew mean/std.
    z_scores = np.abs((df[cols] - df[cols].mean()) / df[cols].std())
    df = df[(z_scores < 3).all(axis=1)]
    return df
||
def clean_and_prepare_data(df):
    """Clean ``df`` and re-index it by its 'Timestamp' column.

    Args:
        df: raw dataset with a 'Timestamp' column parseable by
            ``pd.to_datetime`` plus the sensor columns ``clean_data`` checks.

    Returns:
        The cleaned DataFrame indexed by timestamp.
    """
    # .copy() so the assignments below operate on an independent frame;
    # clean_data returns a boolean-indexed selection, and mutating it
    # directly triggers pandas' SettingWithCopyWarning.
    df_cleaned = clean_data(df).copy()
    df_cleaned['Timestamp'] = pd.to_datetime(df_cleaned['Timestamp'])
    df_cleaned.set_index('Timestamp', inplace=True)
    return df_cleaned
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import matplotlib.pyplot as plt | ||
import seaborn as sns | ||
import streamlit as st | ||
|
||
def plot_time_series(df, dataset_name):
    """Render a line chart of GHI, DNI, DHI and Tamb over time in Streamlit.

    Args:
        df: DataFrame with 'GHI', 'DNI', 'DHI' and 'Tamb' columns — assumes
            it is indexed by timestamp for a meaningful x-axis (TODO confirm).
        dataset_name: label used in the chart title.
    """
    fig, ax = plt.subplots(figsize=(14, 8))
    df[['GHI', 'DNI', 'DHI', 'Tamb']].plot(ax=ax)
    ax.set_title(f'Time Series Analysis of GHI, DNI, DHI, and Tamb in {dataset_name}')
    st.pyplot(fig)
    # Close the figure: Streamlit reruns the script on every interaction, so
    # unclosed Matplotlib figures accumulate and leak memory.
    plt.close(fig)
||
import matplotlib.pyplot as plt | ||
import streamlit as st | ||
|
||
def plot_area(df, title, columns=None):
    """Render a stacked area plot of the given columns in Streamlit.

    Args:
        df: DataFrame containing the columns to plot.
        title: chart title.
        columns: column names to plot. Defaults to the core irradiance
            channels ['GHI', 'DNI', 'DHI'] — required because the app calls
            ``plot_area(df, dataset_name)`` with only two arguments, which
            raised a TypeError against the original three-argument signature.

    Columns mixing positive and negative values are rejected (an area plot
    cannot represent them meaningfully); the error is shown via st.error
    instead of crashing the app.
    """
    if columns is None:
        columns = ['GHI', 'DNI', 'DHI']
    try:
        # Check if any column contains both positive and negative values
        for col in columns:
            if df[col].min() < 0 and df[col].max() > 0:
                raise ValueError(f"Column '{col}' contains both positive and negative values, which is not allowed in an area plot.")

        # Create area plot
        fig, ax = plt.subplots()
        df[columns].plot(kind='area', ax=ax, alpha=0.5)
        ax.set_title(title)
        plt.xticks(rotation=45)
        plt.tight_layout()
        st.pyplot(fig)

    except ValueError as e:
        # Surface the problem in the UI rather than crashing the app.
        st.error(f"Error in plotting area chart: {e}")
||
def create_scatter_plot(df):
    """Show a Streamlit scatter plot of ambient temperature vs. relative humidity.

    Args:
        df: DataFrame with 'RH' and 'Tamb' columns.
    """
    fig, axis = plt.subplots()
    sns.scatterplot(data=df, x='RH', y='Tamb', ax=axis)
    axis.set_title("Scatter Plot: Temperature (Tamb) vs Relative Humidity (RH)")
    st.pyplot(fig)
||
def create_correlation_analysis(df, dataset_name):
    """Write the RH/Tamb/TModA/TModB correlation matrix to the Streamlit page.

    Args:
        df: DataFrame with 'RH', 'Tamb', 'TModA' and 'TModB' columns.
        dataset_name: label used in the section heading.
    """
    cols_of_interest = ['RH', 'Tamb', 'TModA', 'TModB']
    corr_matrix = df[cols_of_interest].corr()
    st.write(f"### Correlation Analysis - {dataset_name}")
    st.write(corr_matrix)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
import numpy as np | ||
|
||
def data_quality_check(df):
    """Summarise data-quality issues for the key sensor columns of ``df``.

    For each of GHI, DNI, DHI, ModA, ModB, WS and WSgust, reports the count
    of missing values, z-score outliers (|z| > 3), and negative entries.

    Args:
        df: DataFrame containing all of the checked columns.

    Returns:
        pandas.DataFrame with one row per checked column and the columns
        'Column', 'Missing Values', 'Outliers', 'Incorrect Entries'.
    """
    columns_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
    subset = df[columns_to_check]
    z_scores = (subset - subset.mean()) / subset.std()
    summary = {
        "Column": columns_to_check,
        "Missing Values": subset.isnull().sum().values,
        "Outliers": z_scores.abs().gt(3).sum().values,
        "Incorrect Entries": subset.lt(0).sum().values,
    }
    return pd.DataFrame(summary)
Oops, something went wrong.