# %% [markdown]
# # EDA analysis
#
# Exploratory analysis of the Benin (Malanville), Togo (Dapaong) and
# Sierra Leone (Bumbuna) datasets: summary statistics, data quality checks and
# cleaning, temperature correlations, time series plots and wind polar plots.
#
# Every cell is written to survive "Restart Kernel -> Run All" and to be safe
# to re-run on its own: no cell mutates a frame created by an earlier cell.

# %% [markdown]
# ### Import Libraries and Load Datasets

# %%
# Importing Libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_rel  # kept from the original notebook; no cell below references it

# %%
# Configuration: everything a reader might want to tune lives here.
DATA_DIR = Path('../data')

# Columns to check for missing values, outliers, or incorrect entries
COLUMNS_TO_CHECK = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Column groups used by the correlation and time series sections
RH_TEMPERATURE_COLUMNS = ['RH', 'Tamb', 'TModA', 'TModB']
SOLAR_TEMPERATURE_COLUMNS = ['GHI', 'DNI', 'DHI', 'Tamb', 'TModA', 'TModB']
TIME_SERIES_COLUMNS = ['GHI', 'DNI', 'DHI', 'Tamb']

# %%
# Load the Datasets (read once; the raw frames are never modified below)
benin = pd.read_csv(DATA_DIR / 'benin-malanville.csv')
togo = pd.read_csv(DATA_DIR / 'togo-dapaong_qc.csv')
sierraleone = pd.read_csv(DATA_DIR / 'sierraleone-bumbuna.csv')

# %% [markdown]
# ### Summary Statistics

# %% [markdown]
# #### Summary Statistics For Benin

# %%
print("\nBenin Summary Statistics:")
# Calculate summary statistics for Benin dataset
benin.describe()

# %% [markdown]
# #### Summary Statistics for Sierra Leone

# %%
print("\nSierra Leone Summary Statistics:")
# Calculate summary statistics for Sierra Leone dataset
sierraleone.describe()

# %% [markdown]
# #### Summary Statistics for Togo

# %%
print("\nTogo Summary Statistics:")
# Calculate summary statistics for Togo dataset
togo.describe()

# %% [markdown]
# ### Data Quality Check and Cleaning

# %%
def _abs_zscores(values):
    """
    Return the absolute Z-score of every entry, computed column by column.

    Parameters:
    values (DataFrame): Numeric columns to standardise.

    Returns:
    DataFrame: Same shape as `values`; entries are NaN where the value is
    missing or the column's standard deviation is zero or undefined.
    """
    return ((values - values.mean()) / values.std()).abs()


def check_incorrect_entries(df, columns):
    """
    Count the incorrect (negative) entries in the specified columns.

    Parameters:
    df (DataFrame): The DataFrame to check.
    columns (list): List of columns to check.

    Returns:
    Series: The count of negative values per column.
    """
    return (df[columns] < 0).sum()


def data_quality_check(df, columns, threshold=3):
    """
    Check for missing values, outliers, and incorrect entries in the specified columns of a DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to check.
    columns (list): List of columns to check.
    threshold (float): Absolute Z-score above which a value counts as an outlier.

    Returns:
    dict: A dictionary containing the count of missing values, outliers, and incorrect entries per column.
    """
    return {
        "missing_values": df[columns].isnull().sum(),
        # NaN Z-scores compare False, so missing values are not counted as outliers.
        "outliers": (_abs_zscores(df[columns]) > threshold).sum(),
        "incorrect_entries": check_incorrect_entries(df, columns),
    }


def remove_incorrect_entries(df, columns):
    """
    Remove rows with incorrect entries (negative values) from the DataFrame.

    Rows with a missing value in any of `columns` are dropped as well, because
    a NaN never satisfies `>= 0`.

    Parameters:
    df (DataFrame): The DataFrame to clean.
    columns (list): List of columns to check for negative values.

    Returns:
    DataFrame: The cleaned DataFrame.
    """
    return df[(df[columns] >= 0).all(axis=1)]


def remove_outliers_zscore(df, columns, threshold=3):
    """
    Remove outliers from the DataFrame using the Z-score method.

    A row is kept only when every checked column has an absolute Z-score below
    `threshold`. Columns whose standard deviation is zero or undefined (constant
    columns, a single row) cannot contain outliers and are skipped; without
    that guard their NaN Z-scores would cause every row to be dropped.

    Parameters:
    df (DataFrame): The DataFrame to clean.
    columns (list): List of columns to check for outliers.
    threshold (float): The Z-score threshold to identify outliers.

    Returns:
    DataFrame: The cleaned DataFrame with outliers removed.
    """
    checked = df[columns]
    spread = checked.std()
    # `spread > 0` is False for both 0 and NaN, so only columns with a usable
    # standard deviation take part in the test.
    varying_columns = spread[spread > 0].index
    z_scores = _abs_zscores(checked[varying_columns])
    return df[(z_scores < threshold).all(axis=1)]


def clean_and_save_dataset(name, df, columns=None, output_dir="."):
    """
    Report data quality, clean a dataset, and save the cleaned copy as CSV.

    The input frame is left untouched; negative entries are removed first and
    Z-score outliers second, so the outlier statistics are computed on rows
    that already passed the negative-value filter.

    Parameters:
    name (str): Display name of the dataset; also used to build the file name
        (lower-cased, spaces replaced by underscores, suffixed `_cleaned.csv`).
    df (DataFrame): The raw DataFrame to clean.
    columns (list): Columns to check. Defaults to COLUMNS_TO_CHECK.
    output_dir (str or Path): Directory the CSV is written to. Defaults to the
        current working directory.

    Returns:
    DataFrame: The cleaned DataFrame.
    """
    if columns is None:
        columns = COLUMNS_TO_CHECK

    # Initial data quality check
    quality_results = data_quality_check(df, columns)
    print(f"\n{name} Data Quality Check:")
    for key, value in quality_results.items():
        print(f"{key.capitalize().replace('_', ' ')}:\n", value)

    # Remove incorrect entries, then outliers
    df_cleaned = remove_incorrect_entries(df, columns)
    df_cleaned = remove_outliers_zscore(df_cleaned, columns)

    # Save cleaned dataset
    cleaned_path = Path(output_dir) / f'{name.lower().replace(" ", "_")}_cleaned.csv'
    df_cleaned.to_csv(cleaned_path, index=False)
    print(f"Cleaned dataset saved as {cleaned_path}")

    # Shape comparison
    print(f"{name} Data Shape (Original vs Cleaned): {df.shape} -> {df_cleaned.shape}")

    # Final check for any remaining incorrect entries
    final_incorrect_entries = check_incorrect_entries(df_cleaned, columns)
    print(f"\n{name} Remaining Incorrect Entries After Cleaning:\n", final_incorrect_entries)

    return df_cleaned

# %%
# Process each dataset, reusing the frames loaded at the top of the notebook
datasets = {
    "Benin": benin,
    "Togo": togo,
    "Sierra Leone": sierraleone,
}

cleaned_datasets = {
    name: clean_and_save_dataset(name, df) for name, df in datasets.items()
}

# Access the cleaned dataset for each
sierraleone_cleaned = cleaned_datasets['Sierra Leone']
benin_cleaned = cleaned_datasets['Benin']
togo_cleaned = cleaned_datasets['Togo']

# %% [markdown]
# ### Temperature Analysis

# %%
def _scatter_with_title(df, x, y, title):
    """Draw a seaborn scatter plot of `y` against `x` on a fresh figure and show it."""
    _, ax = plt.subplots()
    sns.scatterplot(data=df, x=x, y=y, ax=ax)
    ax.set_title(title)
    plt.show()


def analyze_temperature(df, country):
    """
    Print temperature correlation matrices and draw the matching scatter plots.

    Two blocks are produced for the cleaned dataset of one country:
    relative humidity vs. temperatures, then solar radiation vs. temperatures.

    Parameters:
    df (DataFrame): Cleaned dataset containing the RH, Tamb, TModA, TModB, GHI,
        DNI and DHI columns.
    country (str): Country name used in the printed headings and plot titles.
    """
    # Analysis of Temperature and Relative Humidity
    print(f"Correlation Analysis - {country} (Cleaned Data):")
    print(df[RH_TEMPERATURE_COLUMNS].corr())
    _scatter_with_title(
        df, 'RH', 'Tamb',
        f'Temperature (Tamb) vs Relative Humidity (RH) - {country} (Cleaned Data)',
    )

    # Analysis of Solar Radiation and Temperature
    print(f"Solar Radiation and Temperature Correlation - {country} (Cleaned Data):")
    print(df[SOLAR_TEMPERATURE_COLUMNS].corr())
    _scatter_with_title(
        df, 'GHI', 'Tamb',
        f'Global Horizontal Irradiance (GHI) vs Temperature (Tamb) - {country} (Cleaned Data)',
    )

# %% [markdown]
# #### Temp Analysis for Benin

# %%
analyze_temperature(benin_cleaned, 'Benin')

# %% [markdown]
# #### Temp Analysis for Togo

# %%
analyze_temperature(togo_cleaned, 'Togo')

# %% [markdown]
# #### Temp Analysis for Sierra Leone

# %%
analyze_temperature(sierraleone_cleaned, 'Sierra Leone')

# %% [markdown]
# ### Time Analysis

# %% [markdown]
# #### Converting Timestamp to datetime and setting as Index

# %%
def index_by_timestamp(df):
    """
    Return a copy of `df` indexed by its 'Timestamp' column parsed as datetime.

    The input frame is not modified, so the cell calling this function can be
    re-run safely (an in-place `set_index` would fail the second time because
    the 'Timestamp' column would already have been moved into the index).

    Parameters:
    df (DataFrame): Frame with a 'Timestamp' column.

    Returns:
    DataFrame: New frame with a datetime index named 'Timestamp'.
    """
    return df.assign(Timestamp=pd.to_datetime(df['Timestamp'])).set_index('Timestamp')

# %%
benin_ts = index_by_timestamp(benin_cleaned)
sierraleone_ts = index_by_timestamp(sierraleone_cleaned)
togo_ts = index_by_timestamp(togo_cleaned)

# %%
def plot_time_series(df, country):
    """
    Draw a line plot and an area plot of GHI, DNI, DHI and Tamb over time.

    Parameters:
    df (DataFrame): Cleaned dataset indexed by timestamp.
    country (str): Country name used in the plot titles.
    """
    # Line plot of each series against the index
    _, ax = plt.subplots(figsize=(14, 8))
    for column in TIME_SERIES_COLUMNS:
        ax.plot(df.index, df[column], label=column)
    ax.set_xlabel('Time')
    ax.set_ylabel('Values')
    ax.set_title(f'Time Series Analysis of GHI, DNI, DHI, and Tamb in {country} (Cleaned Data)')
    ax.legend()
    ax.grid(True)
    plt.show()

    # Area plot of the same columns
    ax = df[TIME_SERIES_COLUMNS].plot(kind='area', figsize=(14, 8), alpha=0.5)
    ax.set_title(f'Area Plot of GHI, DNI, DHI, and Tamb in {country} (Cleaned Data)')
    ax.set_xlabel('Time')
    ax.set_ylabel('Values')
    plt.show()

# %% [markdown]
# #### Benin Time Series and Area Plot

# %%
plot_time_series(benin_ts, 'Benin')

# %% [markdown]
# #### Sierra Leone Time Series and Area Plot

# %%
plot_time_series(sierraleone_ts, 'Sierra Leone')

# %% [markdown]
# #### Togo Time Series and Area Plot

# %%
plot_time_series(togo_ts, 'Togo')

# %% [markdown]
# ### Wind Analysis

# %% [markdown]
# #### Function for the Polar Plots

# %%
def create_polar_plot(data, title):
    """
    Draw a polar scatter plot of wind speed against wind direction.

    Direction ('WD', in degrees) is the angle and speed ('WS') is both the
    radius and the point colour. North is at the top and angles increase
    clockwise.

    Parameters:
    data (DataFrame): Frame with 'WS' and 'WD' columns; it is not modified.
    title (str): Title shown above the plot.
    """
    # Keep only rows with a positive wind speed; missing speeds compare False
    # and are dropped by the same condition.
    windy = data[data['WS'] > 0]

    # Convert wind direction to radians for plotting
    direction_rad = np.deg2rad(windy['WD'])

    # Create the polar plot
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': 'polar'})
    scatter = ax.scatter(direction_rad, windy['WS'],
                         c=windy['WS'], cmap='viridis', alpha=0.75)

    # Customize the plot
    ax.set_theta_zero_location('N')  # North is at the top
    ax.set_theta_direction(-1)       # Clockwise direction
    ax.set_title(title)

    # Add color bar for wind speed
    cbar = fig.colorbar(scatter, ax=ax)
    cbar.set_label('Wind Speed (m/s)')

    # Display the plot
    plt.show()

# %% [markdown]
# #### After Cleaning

# %%
create_polar_plot(benin_cleaned, 'Wind Speed and Direction - Benin (Cleaned Data)')
create_polar_plot(sierraleone_cleaned, 'Wind Speed and Direction - Sierra Leone (Cleaned Data)')
create_polar_plot(togo_cleaned, 'Wind Speed and Direction - Togo (Cleaned Data)')

# %% [markdown]
# #### Before Cleaning

# %%
create_polar_plot(sierraleone, 'Wind Speed and Direction - Sierra Leone Bumbuna')
create_polar_plot(togo, 'Wind Speed and Direction - Togo Dapaong')
create_polar_plot(benin, 'Wind Speed and Direction - Benin-malanville')