Skip to content

Commit

Permalink
hi
Browse files Browse the repository at this point in the history
  • Loading branch information
AschalewMathewosDamtew committed Aug 24, 2024
2 parents c15d38a + 346e1bf commit e4c47d4
Show file tree
Hide file tree
Showing 1,524 changed files with 315 additions and 414 deletions.
5 changes: 5 additions & 0 deletions .streamlit/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[theme]
primaryColor = '#1c83e1'
backgroundColor = '#ffffff'
secondaryBackgroundColor = '#f0f2f6'
textColor = '#000000'
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
**Solar Farm Data Analysis FOR 10Academy KIM 0 Challenge**

**Overview**
This repository is the hub for a challenge centered around analyzing solar farm data from three countries: Benin, Sierra Leone, and Togo. The challenge is part of a selection process for a 12-week training program, where my skills in Data Engineering (DE), Financial Analytics (FA), and Machine Learning Engineering (MLE) will be put to the test.

**Objective**
The goal of this challenge is to explore and analyze the provided solar farm data to extract meaningful insights. Participants are expected to use data engineering techniques to clean and prepare the data, apply financial analytics to evaluate financial performance, and leverage machine learning to generate predictions and insights regarding the solar farms' operations.

**Details of the Challenge**
- **Data Source:** Datasets from solar farms in Benin, Sierra Leone, and Togo.
- **Focus Areas:**
- **Data Engineering (DE):** Involves cleaning, transforming, and preparing the data for analysis.
- **Financial Analytics (FA):** Focuses on evaluating the financial metrics and performance of the solar farms.
- **Machine Learning Engineering (MLE):** Involves developing models to predict and analyze outcomes.

**Getting Started**
1. **Clone the Repository:**
```bash
git clone https://github.com/AschalewMathewosDamtew/MoonLightEnergy.git
```
2. **Navigate to the Project Directory:**
```bash
cd MoonLightEnergy
```
3. **Set Up Your Environment:** Install the necessary dependencies listed in `requirements.txt`.

4. **Explore the Data:** The data needed for analysis is available in the `data` directory. Use these datasets for your exploration and analysis.

**File Structure**
- `data/` - Directory containing both raw and processed datasets.
- `src/notebooks/` - Jupyter notebooks used for analysis and model development.
- `scripts/` - Python scripts dedicated to data processing and analysis.
- `reports/` - Folder for reports and visualizations generated during the analysis.
- `README.md` - This file.
Binary file added app/__pycache__/data_processing.cpython-312.pyc
Binary file not shown.
Binary file added app/__pycache__/plots.cpython-312.pyc
Binary file not shown.
Binary file added app/__pycache__/utils.cpython-312.pyc
Binary file not shown.
79 changes: 79 additions & 0 deletions app/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import streamlit as st
from data_processing import load_data, clean_and_prepare_data
from utils import data_quality_check
from plots import plot_time_series, plot_area, create_scatter_plot, create_correlation_analysis
import pandas as pd

# Streamlit entry point: pick a dataset, optionally clean it, then render the
# analyses selected in the sidebar. The whole script is re-executed by
# Streamlit on every widget interaction.

# Load datasets
datasets = load_data()

# Streamlit UI
st.title("Solar Radiation Data Analysis")

# Sidebar for dataset selection
dataset_name = st.sidebar.selectbox("Select Dataset", ("Benin", "Togo", "Sierra Leone"))
df = datasets[dataset_name]

# Display the dataset summary
st.write(f"### {dataset_name} Dataset Summary")
st.write(df.describe())

# Sidebar: Want to Clean Section
clean_data = st.sidebar.checkbox("Want to Clean Data")

# Stays None unless the user asked for cleaning, so later code never touches
# an undefined name.
df_cleaned = None

if clean_data:
    # Data Quality Check Before Cleaning
    quality_results_before = data_quality_check(df)
    st.write("#### Data Quality Check Results (Before Cleaning)")
    st.write(pd.DataFrame(quality_results_before).T)

    # Clean Data
    df_cleaned = clean_and_prepare_data(df)

    # Data Quality Check After Cleaning
    quality_results_after = data_quality_check(df_cleaned)
    st.write("#### Data Quality Check Results (After Cleaning)")
    st.write(pd.DataFrame(quality_results_after).T)

    # Display cleaned data
    st.write(f"### {dataset_name} Cleaned Data")
    st.write(df_cleaned.head())

# Sidebar: Analysis Selection
st.sidebar.write("### Which to Analyze?")
analyze_uncleaned = st.sidebar.checkbox("Analyze Uncleaned Data")
analyze_cleaned = st.sidebar.checkbox("Analyze Cleaned Data")

# Determine which dataset to analyze. Uncleaned wins when both boxes are
# ticked; cleaned data is only available when the cleaning step actually ran.
if analyze_uncleaned:
    df_to_analyze, data_label = df, "Uncleaned Data"
elif analyze_cleaned and clean_data:
    df_to_analyze, data_label = df_cleaned, "Cleaned Data"
else:
    df_to_analyze, data_label = None, ""

if df_to_analyze is None:
    st.warning("Please select either 'Analyze Uncleaned Data' or 'Analyze Cleaned Data'.")
else:
    st.sidebar.write(f"### Analysis Options for {data_label}")
    plot_area_selected = st.sidebar.checkbox("Area Plot")
    plot_time_series_selected = st.sidebar.checkbox("Time Series Plot")
    plot_scatter_selected = st.sidebar.checkbox("Scatter Plot")
    plot_correlation_selected = st.sidebar.checkbox("Correlation Analysis")

    # Perform selected analyses
    if plot_correlation_selected:
        create_correlation_analysis(df_to_analyze, dataset_name)

    if plot_area_selected:
        # plot_area needs the columns to draw; omitting them raised a
        # TypeError as soon as "Area Plot" was ticked.
        plot_area(df_to_analyze, dataset_name, ['GHI', 'DNI', 'DHI'])

    if plot_scatter_selected:
        create_scatter_plot(df_to_analyze)

    if plot_time_series_selected:
        plot_time_series(df_to_analyze, dataset_name)
28 changes: 28 additions & 0 deletions app/data_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd
import numpy as np
import os


COLUMNS_TO_CHECK = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']


def load_data():
    """Read the three country CSV files into DataFrames.

    Paths are resolved relative to the directory containing this module, so
    the result does not depend on the current working directory.

    Returns:
        dict mapping the country name ("Benin", "Togo", "Sierra Leone") to
        the DataFrame read from its CSV file.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    relative_csv_paths = {
        "Benin": '../data/benin-malanville.csv',
        "Togo": '../data/togo-dapaong_qc.csv',
        "Sierra Leone": '../data/sierraleone-bumbuna.csv',
    }
    return {
        country: pd.read_csv(os.path.join(module_dir, relative_path))
        for country, relative_path in relative_csv_paths.items()
    }

def clean_data(df, columns=None):
    """Drop rows with negative readings or z-score outliers.

    Two filters are applied in order:

    1. Keep only rows where every checked column is ``>= 0``. Rows holding a
       missing value in a checked column are dropped here as well, because
       ``NaN >= 0`` evaluates to False.
    2. On the surviving rows, drop any row whose absolute z-score reaches 3
       in at least one checked column.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Column names to check. Defaults to the module-level
            ``COLUMNS_TO_CHECK``.

    Returns:
        A new, independent DataFrame containing the retained rows with their
        original index labels.
    """
    cols = COLUMNS_TO_CHECK if columns is None else list(columns)

    non_negative = df[(df[cols] >= 0).all(axis=1)]
    checked = non_negative[cols]
    z_scores = ((checked - checked.mean()) / checked.std()).abs()

    # A zero-variance column (or a single surviving row) gives NaN z-scores.
    # Testing "is an outlier" instead of "is not an outlier" keeps those rows;
    # the previous `(z < 3).all()` form silently discarded the whole frame.
    is_outlier = (z_scores >= 3).any(axis=1)

    # Copy so callers can add or overwrite columns without pandas'
    # SettingWithCopyWarning.
    return non_negative[~is_outlier].copy()

def clean_and_prepare_data(df):
    """Clean *df* and index the result by its parsed ``Timestamp`` column.

    Args:
        df: Raw DataFrame containing a ``Timestamp`` column plus the columns
            that ``clean_data`` checks. It is not modified.

    Returns:
        A new DataFrame with the rows retained by ``clean_data``, indexed by
        ``Timestamp`` converted with ``pd.to_datetime``.
    """
    # clean_data returns a boolean-filtered frame; take an explicit copy
    # before assigning a column to avoid pandas' SettingWithCopyWarning.
    df_cleaned = clean_data(df).copy()
    df_cleaned['Timestamp'] = pd.to_datetime(df_cleaned['Timestamp'])
    return df_cleaned.set_index('Timestamp')
9 changes: 5 additions & 4 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
st.title("Solar Radiation Data Analysis")

# Load data
df = da.load_data('data/benin-malanville.csv')
df = da.load_data('../data/benin-malanville.csv')


# Sidebar
option = st.sidebar.selectbox("Select Analysis", ("Summary Statistics", "Time Series Analysis",
"Correlation Analysis", "Wind Analysis",
"Correlation Analysis", "Create Wind Plot",
"Temperature Analysis", "Histograms",
"Z-Score Analysis", "Bubble Chart"))

Expand All @@ -31,8 +32,8 @@
da.time_series_analysis(df)
elif option == "Correlation Analysis":
da.correlation_analysis(df)
elif option == "Wind Analysis":
da.wind_analysis(df)
elif option == "Create Wind Analysis":
da.create_polar_plot(df, 'Wind Direction')
elif option == "Temperature Analysis":
da.temperature_analysis(df)
elif option == "Histograms":
Expand Down
42 changes: 42 additions & 0 deletions app/plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

def plot_time_series(df, dataset_name):
    """Render a line plot of GHI, DNI, DHI and Tamb in the Streamlit app.

    The columns are plotted against the DataFrame's index.

    Args:
        df: DataFrame containing the ``GHI``, ``DNI``, ``DHI`` and ``Tamb``
            columns.
        dataset_name: Name shown in the plot title.
    """
    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        df[['GHI', 'DNI', 'DHI', 'Tamb']].plot(ax=ax)
        ax.set_title(f'Time Series Analysis of GHI, DNI, DHI, and Tamb in {dataset_name}')
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # each Streamlit rerun would leak one more figure.
        plt.close(fig)

import matplotlib.pyplot as plt
import streamlit as st

def plot_area(df, title, columns=None):
    """Render an area plot of the selected columns in the Streamlit app.

    A column that mixes positive and negative values is rejected up front.
    That error, like any other ``ValueError`` raised while plotting, is
    reported through ``st.error`` instead of propagating.

    Args:
        df: DataFrame holding the columns to plot.
        title: Title for the plot.
        columns: Column names to plot. Defaults to ``['GHI', 'DNI', 'DHI']``
            so that two-argument calls such as ``plot_area(df, name)`` work.
    """
    if columns is None:
        columns = ['GHI', 'DNI', 'DHI']

    fig = None
    try:
        # Check if any column contains both positive and negative values
        for col in columns:
            if df[col].min() < 0 and df[col].max() > 0:
                raise ValueError(f"Column '{col}' contains both positive and negative values, which is not allowed in an area plot.")

        # Create area plot
        fig, ax = plt.subplots()
        df[columns].plot(kind='area', ax=ax, alpha=0.5)
        ax.set_title(title)
        plt.xticks(rotation=45)
        plt.tight_layout()
        st.pyplot(fig)

    except ValueError as e:
        # Handle the ValueError and provide an appropriate message
        st.error(f"Error in plotting area chart: {e}")

    finally:
        # Release the figure (if one was created) so repeated Streamlit
        # reruns do not accumulate open pyplot figures.
        if fig is not None:
            plt.close(fig)

def create_scatter_plot(df):
    """Render a scatter plot of ``Tamb`` against ``RH`` in the Streamlit app.

    Args:
        df: DataFrame containing the ``RH`` (x-axis) and ``Tamb`` (y-axis)
            columns.
    """
    fig, ax = plt.subplots()
    try:
        sns.scatterplot(data=df, x='RH', y='Tamb', ax=ax)
        ax.set_title("Scatter Plot: Temperature (Tamb) vs Relative Humidity (RH)")
        st.pyplot(fig)
    finally:
        # Close the figure once rendered; pyplot otherwise keeps it alive
        # across Streamlit reruns.
        plt.close(fig)

def create_correlation_analysis(df, dataset_name):
    """Show the correlation matrix of RH, Tamb, TModA and TModB.

    Args:
        df: DataFrame containing the ``RH``, ``Tamb``, ``TModA`` and
            ``TModB`` columns.
        dataset_name: Name shown in the section heading.
    """
    variables = ['RH', 'Tamb', 'TModA', 'TModB']
    corr_matrix = df[variables].corr()

    heading = f"### Correlation Analysis - {dataset_name}"
    st.write(heading)
    st.write(corr_matrix)
12 changes: 12 additions & 0 deletions app/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd
import numpy as np

def data_quality_check(df, columns=None):
    """Summarise missing values, outliers and negative entries per column.

    Args:
        df: DataFrame to inspect; it is not modified.
        columns: Column names to check. Defaults to
            ``['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']``.

    Returns:
        DataFrame with one row per checked column and the columns
        ``Column``, ``Missing Values`` (null count), ``Outliers`` (values
        whose absolute z-score exceeds 3) and ``Incorrect Entries`` (values
        below zero).
    """
    if columns is None:
        columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
    cols = list(columns)

    # Select the checked columns once instead of re-slicing the frame for
    # every metric.
    checked = df[cols]
    z_scores = np.abs((checked - checked.mean()) / checked.std())

    data_quality = {
        "Column": cols,
        "Missing Values": checked.isnull().sum().values,
        "Outliers": (z_scores > 3).sum().values,
        "Incorrect Entries": (checked < 0).sum().values,
    }
    return pd.DataFrame(data_quality)
Loading

0 comments on commit e4c47d4

Please sign in to comment.