hi

AschalewMathewosDamtew · Aug 24, 2024 · e4c47d4 · e4c47d4
2 parents c15d38a + 346e1bf
commit e4c47d4
Show file tree

Hide file tree

Showing 1,524 changed files with 315 additions and 414 deletions.
diff --git a/.streamlit/config.toml b/.streamlit/config.toml
@@ -0,0 +1,5 @@
+[theme]
+primaryColor = '#1c83e1'
+backgroundColor = '#ffffff'
+secondaryBackgroundColor = '#f0f2f6'
+textColor = '#000000'
diff --git a/README.md b/README.md
@@ -0,0 +1,34 @@
+**Solar Farm Data Analysis FOR 10Academy KIM 0 Challenge**
+
+**Overview**
+This repository is the hub for a challenge centered around analyzing solar farm data from three countries: Benin, Sierra Leone, and Togo. The challenge is part of a selection process for a 12-week training program, where my skills in Data Engineering (DE), Financial Analytics (FA), and Machine Learning Engineering (MLE) will be put to the test.
+
+**Objective**
+The goal of this challenge is to explore and analyze the provided solar farm data to extract meaningful insights. Participants are expected to use data engineering techniques to clean and prepare the data, apply financial analytics to evaluate financial performance, and leverage machine learning to generate predictions and insights regarding the solar farms' operations.
+
+**Details of the Challenge**
+- **Data Source:** Datasets from solar farms in Benin, Sierra Leone, and Togo.
+- **Focus Areas:**
+  - **Data Engineering (DE):** Involves cleaning, transforming, and preparing the data for analysis.
+  - **Financial Analytics (FA):** Focuses on evaluating the financial metrics and performance of the solar farms.
+  - **Machine Learning Engineering (MLE):** Involves developing models to predict and analyze outcomes.
+
+**Getting Started**
+1. **Clone the Repository:**
+   ```bash
+   git clone https://github.com/AschalewMathewosDamtew/MoonLightEnergy.git
+   ```
+2. **Navigate to the Project Directory:**
+   ```bash
+   cd MoonLightEnergy
+   ```
+3. **Set Up Your Environment:** Install the necessary dependencies listed in `requirements.txt` (for example, `pip install -r requirements.txt`).
+
+4. **Explore the Data:** The data needed for analysis is available in the `data` directory. Use these datasets for your exploration and analysis.
+
+**File Structure**
+- `data/` - Directory containing both raw and processed datasets.
+- `src/notebooks/` - Jupyter notebooks used for analysis and model development.
+- `scripts/` - Python scripts dedicated to data processing and analysis.
+- `reports/` - Folder for reports and visualizations generated during the analysis.
+- `README.md` - This file.
diff --git a/app/__pycache__/data_processing.cpython-312.pyc b/app/__pycache__/data_processing.cpython-312.pyc
diff --git a/app/__pycache__/plots.cpython-312.pyc b/app/__pycache__/plots.cpython-312.pyc
diff --git a/app/__pycache__/utils.cpython-312.pyc b/app/__pycache__/utils.cpython-312.pyc
diff --git a/app/app.py b/app/app.py
@@ -0,0 +1,79 @@
+import streamlit as st
+from data_processing import load_data, clean_and_prepare_data
+from utils import data_quality_check
+from plots import plot_time_series, plot_area, create_scatter_plot, create_correlation_analysis
+import pandas as pd
+
+# Load every country's dataset up front; the sidebar choice below picks one.
+datasets = load_data()
+
+# Streamlit UI
+st.title("Solar Radiation Data Analysis")
+
+# Sidebar for dataset selection
+dataset_name = st.sidebar.selectbox("Select Dataset", ("Benin", "Togo", "Sierra Leone"))
+df = datasets[dataset_name]
+
+# Display the dataset summary
+st.write(f"### {dataset_name} Dataset Summary")
+st.write(df.describe())
+
+# Sidebar: Want to Clean Section
+clean_data = st.sidebar.checkbox("Want to Clean Data")
+
+# Stays None unless cleaning was requested, so the analysis section below can
+# test for availability instead of relying on a name that may not exist.
+df_cleaned = None
+
+if clean_data:
+    # Data Quality Check Before Cleaning
+    quality_results_before = data_quality_check(df)
+    st.write("#### Data Quality Check Results (Before Cleaning)")
+    st.write(pd.DataFrame(quality_results_before).T)
+
+    # Clean Data
+    df_cleaned = clean_and_prepare_data(df)
+
+    # Data Quality Check After Cleaning
+    quality_results_after = data_quality_check(df_cleaned)
+    st.write("#### Data Quality Check Results (After Cleaning)")
+    st.write(pd.DataFrame(quality_results_after).T)
+
+    # Display cleaned data
+    st.write(f"### {dataset_name} Cleaned Data")
+    st.write(df_cleaned.head())
+
+# Sidebar: Analysis Selection
+st.sidebar.write("### Which to Analyze?")
+analyze_uncleaned = st.sidebar.checkbox("Analyze Uncleaned Data")
+analyze_cleaned = st.sidebar.checkbox("Analyze Cleaned Data")
+
+# Determine which dataset to analyze. The uncleaned data takes precedence
+# when both boxes are ticked.
+df_to_analyze = None
+data_label = ""
+
+if analyze_uncleaned:
+    df_to_analyze = df
+    data_label = "Uncleaned Data"
+elif analyze_cleaned and df_cleaned is not None:
+    df_to_analyze = df_cleaned
+    data_label = "Cleaned Data"
+
+if df_to_analyze is not None:
+    st.sidebar.write(f"### Analysis Options for {data_label}")
+    plot_area_selected = st.sidebar.checkbox("Area Plot")
+    plot_time_series_selected = st.sidebar.checkbox("Time Series Plot")
+    plot_scatter_selected = st.sidebar.checkbox("Scatter Plot")
+    plot_correlation_selected = st.sidebar.checkbox("Correlation Analysis")
+
+    # Perform selected analyses
+    if plot_correlation_selected:
+        create_correlation_analysis(df_to_analyze, dataset_name)
+
+    if plot_area_selected:
+        # plot_area takes (df, title, columns); the columns argument used to be
+        # missing here, which raised a TypeError as soon as the box was ticked.
+        plot_area(df_to_analyze, dataset_name, ['GHI', 'DNI', 'DHI'])
+
+    if plot_scatter_selected:
+        create_scatter_plot(df_to_analyze)
+
+    if plot_time_series_selected:
+        plot_time_series(df_to_analyze, dataset_name)
+elif analyze_cleaned:
+    # "Analyze Cleaned Data" is ticked but no cleaned frame exists yet; say so
+    # instead of asking the user to tick a box they already ticked.
+    st.warning("Cleaned data is not available yet. Tick 'Want to Clean Data' first.")
+else:
+    st.warning("Please select either 'Analyze Uncleaned Data' or 'Analyze Cleaned Data'.")
diff --git a/app/data_processing.py b/app/data_processing.py
@@ -0,0 +1,28 @@
+import pandas as pd
+import numpy as np
+import os
+
+
+COLUMNS_TO_CHECK = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']  # columns clean_data screens for negative values and z-score outliers
+
+
+def load_data():
+    """Read the three country CSV files into DataFrames.
+
+    File locations are resolved against this module's own directory
+    (``../data/`` relative to it), not the current working directory.
+
+    Returns:
+        dict: maps "Benin", "Togo" and "Sierra Leone" to the DataFrame read
+        from the corresponding CSV file.
+    """
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    csv_by_country = {
+        "Benin": '../data/benin-malanville.csv',
+        "Togo": '../data/togo-dapaong_qc.csv',
+        "Sierra Leone": '../data/sierraleone-bumbuna.csv',
+    }
+    return {
+        country: pd.read_csv(os.path.join(module_dir, relative_csv))
+        for country, relative_csv in csv_by_country.items()
+    }
+
+def clean_data(df):
+    """Drop rows with negative or outlying readings in COLUMNS_TO_CHECK.
+
+    A row is kept only if every checked column is >= 0 and no checked column
+    lies 3 or more standard deviations from that column's mean. The mean and
+    standard deviation are computed after the negative rows are removed.
+
+    Args:
+        df: DataFrame containing all COLUMNS_TO_CHECK columns.
+
+    Returns:
+        The filtered DataFrame; the input frame is not modified.
+    """
+    non_negative = (df[COLUMNS_TO_CHECK] >= 0).all(axis=1)
+    df = df[non_negative]
+
+    checked = df[COLUMNS_TO_CHECK]
+    z_scores = np.abs((checked - checked.mean()) / checked.std())
+
+    # A zero or undefined standard deviation (constant column, or fewer than
+    # two rows left) yields NaN z-scores. `NaN < 3` is False, so the previous
+    # `(z_scores < 3).all(axis=1)` test discarded every remaining row. Such
+    # values cannot be outliers, so only rows with a real z-score >= 3 go.
+    is_outlier = (z_scores >= 3).any(axis=1)
+    return df[~is_outlier]
+
+def clean_and_prepare_data(df):
+    """Clean the readings and index the result by timestamp.
+
+    Runs clean_data on ``df``, parses the 'Timestamp' column with
+    ``pd.to_datetime`` and makes it the index.
+
+    Args:
+        df: DataFrame with a 'Timestamp' column plus the columns clean_data
+            checks.
+
+    Returns:
+        A new DataFrame of the cleaned rows, indexed by 'Timestamp'.
+    """
+    # clean_data returns a filtered selection of df. Work on an explicit copy
+    # so the column assignment below does not trigger pandas'
+    # SettingWithCopyWarning.
+    df_cleaned = clean_data(df).copy()
+    df_cleaned['Timestamp'] = pd.to_datetime(df_cleaned['Timestamp'])
+    return df_cleaned.set_index('Timestamp')
diff --git a/app/main.py b/app/main.py
@@ -16,11 +16,12 @@
 st.title("Solar Radiation Data Analysis")
 
 # Load data
-df = da.load_data('data/benin-malanville.csv')
+df = da.load_data('../data/benin-malanville.csv')
+
 
 # Sidebar
 option = st.sidebar.selectbox("Select Analysis", ("Summary Statistics", "Time Series Analysis", 
-                                                  "Correlation Analysis", "Wind Analysis", 
+                                                  "Correlation Analysis", "Create Wind Plot", 
                                                   "Temperature Analysis", "Histograms", 
                                                   "Z-Score Analysis", "Bubble Chart"))
 
@@ -31,8 +32,8 @@
     da.time_series_analysis(df)
 elif option == "Correlation Analysis":
     da.correlation_analysis(df)
-elif option == "Wind Analysis":
-    da.wind_analysis(df)
+elif option == "Create Wind Analysis":
+    da.create_polar_plot(df, 'Wind Direction')
 elif option == "Temperature Analysis":
     da.temperature_analysis(df)
 elif option == "Histograms":

diff --git a/app/plots.py b/app/plots.py
@@ -0,0 +1,42 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+import streamlit as st
+
+def plot_time_series(df, dataset_name):
+    """Draw GHI, DNI, DHI and Tamb as line series and show them in Streamlit.
+
+    The series are plotted against the DataFrame's index.
+
+    Args:
+        df: DataFrame containing 'GHI', 'DNI', 'DHI' and 'Tamb' columns.
+        dataset_name: Name inserted into the plot title.
+    """
+    fig, ax = plt.subplots(figsize=(14, 8))
+    try:
+        df[['GHI', 'DNI', 'DHI', 'Tamb']].plot(ax=ax)
+        # Set the title on this axes explicitly rather than on pyplot's
+        # "current" axes.
+        ax.set_title(f'Time Series Analysis of GHI, DNI, DHI, and Tamb in {dataset_name}')
+        st.pyplot(fig)
+    finally:
+        # pyplot keeps every figure it creates until it is closed; without
+        # this, each call leaves another figure in memory.
+        plt.close(fig)
+
+import matplotlib.pyplot as plt
+import streamlit as st
+
+def plot_area(df, title, columns=None):
+    """Draw an area chart of the selected columns and show it in Streamlit.
+
+    Problems are reported to the user with ``st.error`` instead of being
+    raised: a column that mixes positive and negative values, or any
+    ValueError raised while plotting.
+
+    Args:
+        df: DataFrame holding the data to plot.
+        title: Title placed on the chart.
+        columns: List of column names to plot. Defaults to
+            ['GHI', 'DNI', 'DHI'] so that callers passing only ``df`` and
+            ``title`` keep working.
+    """
+    if columns is None:
+        columns = ['GHI', 'DNI', 'DHI']
+
+    # An area plot cannot be drawn for a column that mixes signs, so check
+    # first and report the offending column by name.
+    for col in columns:
+        if df[col].min() < 0 and df[col].max() > 0:
+            st.error(
+                "Error in plotting area chart: "
+                f"Column '{col}' contains both positive and negative values, which is not allowed in an area plot."
+            )
+            return
+
+    fig, ax = plt.subplots()
+    try:
+        df[columns].plot(kind='area', ax=ax, alpha=0.5)
+        ax.set_title(title)
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+        st.pyplot(fig)
+    except ValueError as e:
+        # Handle the ValueError and provide an appropriate message
+        st.error(f"Error in plotting area chart: {e}")
+    finally:
+        # pyplot keeps every figure it creates until it is closed; without
+        # this, each call leaves another figure in memory.
+        plt.close(fig)
+
+def create_scatter_plot(df):
+    """Show a scatter plot of Tamb (y axis) against RH (x axis) in Streamlit.
+
+    Args:
+        df: DataFrame containing 'RH' and 'Tamb' columns.
+    """
+    fig, ax = plt.subplots()
+    try:
+        sns.scatterplot(data=df, x='RH', y='Tamb', ax=ax)
+        # Set the title on this axes explicitly rather than on pyplot's
+        # "current" axes.
+        ax.set_title("Scatter Plot: Temperature (Tamb) vs Relative Humidity (RH)")
+        st.pyplot(fig)
+    finally:
+        # pyplot keeps every figure it creates until it is closed; without
+        # this, each call leaves another figure in memory.
+        plt.close(fig)
+
+def create_correlation_analysis(df, dataset_name):
+    """Write the RH / Tamb / TModA / TModB correlation matrix to the page.
+
+    Args:
+        df: DataFrame containing 'RH', 'Tamb', 'TModA' and 'TModB' columns.
+        dataset_name: Name shown in the section heading.
+    """
+    selected = ['RH', 'Tamb', 'TModA', 'TModB']
+    # Compute the matrix before emitting anything, so a missing column fails
+    # before the heading is written.
+    corr_matrix = df[selected].corr()
+    heading = f"### Correlation Analysis - {dataset_name}"
+    for item in (heading, corr_matrix):
+        st.write(item)
diff --git a/app/utils.py b/app/utils.py
@@ -0,0 +1,12 @@
+import pandas as pd
+import numpy as np
+
+def data_quality_check(df):
+    """Summarise data-quality problems for the checked measurement columns.
+
+    Args:
+        df: DataFrame containing the 'GHI', 'DNI', 'DHI', 'ModA', 'ModB',
+            'WS' and 'WSgust' columns.
+
+    Returns:
+        DataFrame with one row per checked column and the columns "Column",
+        "Missing Values" (null count), "Outliers" (values more than 3
+        standard deviations from the column mean) and "Incorrect Entries"
+        (negative values).
+    """
+    COLUMNS_TO_CHECK = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
+    checked = df[COLUMNS_TO_CHECK]
+
+    missing_counts = checked.isnull().sum().values
+
+    deviation = checked - checked.mean()
+    abs_z = np.abs(deviation / checked.std())
+    outlier_counts = (abs_z > 3).sum().values
+
+    negative_counts = (checked < 0).sum().values
+
+    summary = {
+        "Column": COLUMNS_TO_CHECK,
+        "Missing Values": missing_counts,
+        "Outliers": outlier_counts,
+        "Incorrect Entries": negative_counts,
+    }
+    return pd.DataFrame(summary)