From 6b69c3295ab94d6bbc3bab7f386795be678ca0fa Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 4 Aug 2024 19:19:50 +0200 Subject: [PATCH 01/18] Add avocado price prediction linear regression example Signed-off-by: Omar Shrit --- .../avocado_price_prediction/Makefile | 44 ++++++ .../avocado_price_prediction.cpp | 140 ++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 cpp/linear_regression/avocado_price_prediction/Makefile create mode 100644 cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp diff --git a/cpp/linear_regression/avocado_price_prediction/Makefile b/cpp/linear_regression/avocado_price_prediction/Makefile new file mode 100644 index 00000000..5edc2976 --- /dev/null +++ b/cpp/linear_regression/avocado_price_prediction/Makefile @@ -0,0 +1,44 @@ +# This is a simple Makefile used to build the example source code. +# This example might requires some modifications in order to work correctly on +# your system. +# If you're not using the Armadillo wrapper, replace `armadillo` with linker commands +# for the BLAS and LAPACK libraries that you are using. + +TARGET := avocado_price_prediction +SRC := avocado_price_prediction.cpp +LIBS_NAME := armadillo + +CXX := g++ +CXXFLAGS += -std=c++17 -Wall -Wextra -O3 -DNDEBUG -fopenmp +# Use these CXXFLAGS instead if you want to compile with debugging symbols and +# without optimizations. +# CXXFLAGS += -std=c++17 -Wall -Wextra -g -O0 + +LDFLAGS += -fopenmp +# Add header directories for any includes that aren't on the +# default compiler search path. +INCLFLAGS := -I . +# If you have mlpack or ensmallen installed somewhere nonstandard, uncomment and +# update the lines below. 
+# Uncomment the following if you are using the Scatter function for plotting +# INCLFLAGS += -I/usr/include/python3.11 +# INCLFLAGS += -I/path/to/ensmallen/include/ +CXXFLAGS += $(INCLFLAGS) + +OBJS := $(SRC:.cpp=.o) +LIBS := $(addprefix -l,$(LIBS_NAME)) +CLEAN_LIST := $(TARGET) $(OBJS) + +# default rule +default: all + +$(TARGET): $(OBJS) + $(CXX) $(OBJS) -o $(TARGET) $(LDFLAGS) $(LIBS) + +.PHONY: all +all: $(TARGET) + +.PHONY: clean +clean: + @echo CLEAN $(CLEAN_LIST) + @rm -f $(CLEAN_LIST) diff --git a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp new file mode 100644 index 00000000..b742b020 --- /dev/null +++ b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp @@ -0,0 +1,140 @@ +/** +* Predicting Avocado's Average Price using Linear Regression +* Our target is to predict the future price of avocados depending on various +* features (Type, Region, Total Bags, ...). +* +* Dataset +* +* Avocado Prices dataset has the following features: +* PLU - Product Lookup Code in Hass avocado board. +* Date - The date of the observation. +* AveragePrice - Observed average price of single avocado. +* Total Volume - Total number of avocado's sold. +* 4046 - Total number of avocado's with PLU 4046 sold. +* 4225 - Total number of avocado's with PLU 4225 sold. +* 4770 - Total number of avocado's with PLU 4770 sold. +* Total Bags = Small Bags + Large Bags + XLarge Bags. +* Type - Conventional or organic. +* Year - Year of observation. +* Region - City or region of observation. +* +* Approach +* +* In this example, first we will do EDA on the dataset to find correlation +* between various features. +* Then we'll be using onehot encoding to encode categorical features. +* Finally we will use LinearRegression API from mlpack to learn the correlation +* between various features and the target i.e AveragePrice. 
+* After training the model, we will use it to do some predictions, followed by +* various evaluation metrics to quantify how well our model behaves. +*/ +#include +using namespace mlpack; +using namespace mlpack::data; + +//Drop the dataset header using sed, sed is a Unix utility that parses and transforms text." +//!mkdir -p data && cat avocado.csv | sed 1d > avocado_trim.csv" +//"Drop columns 1 and 2 (\"Unamed: 0\", \"Date\") as these are not required and their presence cause issues while loading the data." +//!rm avocado_trim.csv" +//"!mv avocado_trim2.csv avocado_trim.csv" + +int main() +{ + //!wget -q https://datasets.mlpack.org/avocado.csv.gz" + // Features 9 (Avocado type) and 11 (region of observation) are strings + // (categorical), but armadillo matrices can contain only numeric information; + // so, we have to explicitly define them as categorical in `datasetInfo` + // this allows mlpack to map numeric values to each of those values, + // which can later be unmapped to strings. + // Load the dataset into armadillo matrix. + + arma::mat matrix; + data::DatasetInfo info; + info.Type(9) = data::Datatype::categorical; + info.Type(11) = data::Datatype::categorical; + data::Load("avocado_trim.csv", matrix, info); + // Printing header for dataset. + std::cout << std::setw(10) << "AveragePrice" << std::setw(14) + << "Total Volume" << std::setw(9) << "4046" << std::setw(13) + << "4225" << std::setw(13) << "4770" << std::setw(17) << "Total Bags" + << std::setw(13) << "Small Bags" << std::setw(13) << "Large Bags" + << std::setw(17) << "XLarge Bags" << std::setw(10) << "Type" + << std::setw(10) << "Year" << std::setw(15) << "Region" << std::endl; + std::cout << matrix.submat(0, 0, matrix.n_rows-1, 5).t() << std::endl; + // Exploratory Data Analysis + arma::mat output; + data::OneHotEncoding(matrix, output, info); + arma::Row targets = arma::conv_to>::from(output.row(0)); + // Labels are dropped from the originally loaded data to be used as features. 
+ output.shed_row(0); + // Train Test Split, + // The dataset has to be split into a training set and a test set. Here the + // dataset has 18249 observations and the `testRatio` is set to 20% of the + // total observations. This indicates the test set should have + // 20% * 18249 = 3649 observations and training test should have + // 14600 observations respectively. + arma::mat Xtrain; + arma::mat Xtest; + arma::Row Ytrain; + arma::Row Ytest; + data::Split(output, targets, Xtrain, Xtest, Ytrain, Ytest, 0.2); + // Convert armadillo Rows into rowvec. (Required by mlpacks' LinearRegression API in this format). + arma::rowvec yTrain = arma::conv_to::from(Ytrain); + arma::rowvec yTest = arma::conv_to::from(Ytest); + /* Training the linear model. + * Regression analysis is the most widely used method of prediction. + * Linear regression is used when the dataset has a linear correlation + * and as the name suggests, multiple linear regression has one independent + * variable (predictor) and one or more dependent variable(response). + * The simple linear regression equation is represented as + * y = $a + b_{1}x_{1} + b_{2}x_{2} + b_{3}x_{3} + ... + b_{n}x_{n}$ + * where $x_{i}$ is the ith explanatory variable, y is the dependent + * variable, $b_{i}$ is ith coefficient and a is the intercept. + * To perform linear regression we'll be using the `LinearRegression` class from mlpack. + * Create and train Linear Regression model. + */ + LinearRegression lr(Xtrain, yTrain, 0.5); + arma::rowvec yPreds; + lr.Predict(Xtest, yPreds); + // Save the yTest and yPreds into csv for generating plots. 
+ arma::mat preds; + preds.insert_rows(0, yTest); + preds.insert_rows(1, yPreds); + arma::mat histpreds = yTest - yPreds; + mlpack::data::Save("./data/predictions.csv", preds); + mlpack::data::Save("./data/predsDiff.csv", yPreds); + //!sed -i '1iY_Test,Y_Preds' ./data/predictions.csv" + //!sed -i '1iY_Diff' ./data/predsDiff.csv" + /* + * Model Evaluation, + * Test data is visualized with `yTest` and `yPreds`, the blue points + * indicates the data points and the blue line indicates the regression + * line or best fit line. + * Evaluation Metrics for Regression model, + * In the previous cell we have visualized our model performance by plotting + * the best fit line. Now we will use various evaluation metrics to understand + * how well our model has performed. + * Mean Absolute Error (MAE) is the sum of absolute differences between actual + * and predicted values, without considering the direction. + * MAE = \\frac{\\sum_{i=1}^n\\lvert y_{i} - \\hat{y_{i}}\\rvert} {n} + * Mean Squared Error (MSE) is calculated as the mean or average of the + * squared differences between predicted and expected target values in + * a dataset, a lower value is better + * MSE = \\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2, + * Root Mean Squared Error (RMSE), Square root of MSE yields root mean square + * error (RMSE) it indicates the spread of the residual errors. It is always + * positive, and a lower value indicates better performance. + * RMSE = \\sqrt{\\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2} + */ + // Model evaluation metrics. + // From the above metrics, we can notice that our model MAE is ~0.2, + // which is relatively small compared to our average price of $1.405, + // from this and the above plot we can conclude our model is a reasonably good fit. 
+ + std::cout << "Mean Absolute Error: " + << arma::mean(arma::abs(yPreds - yTest)) << std::endl; + std::cout << "Mean Squared Error: " + << arma::mean(arma::pow(yPreds - yTest,2)) << std::endl; + std::cout << "Root Mean Squared Error: " + << sqrt(arma::mean(arma::pow(yPreds - yTest,2))) << std::endl; +} From 940f0dcdfebd12a5009d26b223b90f2b942f46a8 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 4 Aug 2024 19:29:57 +0200 Subject: [PATCH 02/18] Adding the avocado dataset download script Signed-off-by: Omar Shrit --- scripts/download_data_set.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/download_data_set.py b/scripts/download_data_set.py index fe99476a..4dc11778 100755 --- a/scripts/download_data_set.py +++ b/scripts/download_data_set.py @@ -159,6 +159,12 @@ def covertype_dataset(): progress_bar("covertype-small.csv.gz", covertype) ungzip("covertype-small.csv.gz", "covertype-small.csv") +def avocado_dataset(): + print("Downloading the avocado price prediction dataset...") + avocado = requests.get("https://datasets.mlpack.org/avocado.csv.gz") + progress_bar("avocado.csv.gz", avocado) + ungzip("avocado.csv.gz", "avocado.csv") + def dominant_color_dataset(): print("Downloading dominant color dataset...") jurassic_park = requests.get("https://datasets.mlpack.org/jurassic-park.png") @@ -205,6 +211,7 @@ def all_datasets(): Usage: --dataset_name dataset_name Available options: + avocado: will download the avocado price prediction dataset mnist : will download mnist dataset electricity : will download electricty_consumption_dataset stock : will download stock_exchange dataset @@ -257,6 +264,9 @@ def all_datasets(): elif args.dataset_name == "color": create_dataset_dir() dominant_color_dataset() + elif args.dataset_name == "avocado": + create_dataset_dir() + avocado_dataset() elif args.dataset_name == "covertype": create_dataset_dir() covertype_dataset() From 739b4ae40fbd3c94e3dee04806d9d6c66d1d2e32 Mon Sep 17 00:00:00 2001 From: Omar Shrit 
Date: Sun, 4 Aug 2024 19:51:36 +0200 Subject: [PATCH 03/18] Trim the first two columns with pandas Signed-off-by: Omar Shrit --- .../avocado_price_prediction.cpp | 10 +++++----- scripts/download_data_set.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp index b742b020..fc34ad58 100644 --- a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp +++ b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp @@ -47,12 +47,12 @@ int main() // this allows mlpack to map numeric values to each of those values, // which can later be unmapped to strings. // Load the dataset into armadillo matrix. - + arma::mat matrix; data::DatasetInfo info; info.Type(9) = data::Datatype::categorical; info.Type(11) = data::Datatype::categorical; - data::Load("avocado_trim.csv", matrix, info); + data::Load("../../../data/avocado.csv", matrix, info); // Printing header for dataset. std::cout << std::setw(10) << "AveragePrice" << std::setw(14) << "Total Volume" << std::setw(9) << "4046" << std::setw(13) @@ -81,6 +81,7 @@ int main() // Convert armadillo Rows into rowvec. (Required by mlpacks' LinearRegression API in this format). arma::rowvec yTrain = arma::conv_to::from(Ytrain); arma::rowvec yTest = arma::conv_to::from(Ytest); + /* Training the linear model. * Regression analysis is the most widely used method of prediction. 
* Linear regression is used when the dataset has a linear correlation @@ -103,8 +104,6 @@ int main() arma::mat histpreds = yTest - yPreds; mlpack::data::Save("./data/predictions.csv", preds); mlpack::data::Save("./data/predsDiff.csv", yPreds); - //!sed -i '1iY_Test,Y_Preds' ./data/predictions.csv" - //!sed -i '1iY_Diff' ./data/predsDiff.csv" /* * Model Evaluation, * Test data is visualized with `yTest` and `yPreds`, the blue points @@ -129,7 +128,8 @@ int main() // Model evaluation metrics. // From the above metrics, we can notice that our model MAE is ~0.2, // which is relatively small compared to our average price of $1.405, - // from this and the above plot we can conclude our model is a reasonably good fit. + // from this and the above plot we can conclude our model is a reasonably + // good fit. std::cout << "Mean Absolute Error: " << arma::mean(arma::abs(yPreds - yTest)) << std::endl; diff --git a/scripts/download_data_set.py b/scripts/download_data_set.py index 4dc11778..66c1a2af 100755 --- a/scripts/download_data_set.py +++ b/scripts/download_data_set.py @@ -9,6 +9,7 @@ import tarfile import textwrap from tqdm import tqdm +import pandas as pd import requests import shutil @@ -46,6 +47,11 @@ def convert(imgf, labelf, outf, n): l.close() t.close() + +def pull_csv(file): + csv_file = pd.read_csv(file, sep=',', comment='#') + return csv_file + def create_dataset_dir(): if os.path.exists("../data"): os.chdir("../data") @@ -164,6 +170,11 @@ def avocado_dataset(): avocado = requests.get("https://datasets.mlpack.org/avocado.csv.gz") progress_bar("avocado.csv.gz", avocado) ungzip("avocado.csv.gz", "avocado.csv") + avocado_data = pull_csv("avocado.csv") + avocado_data = avocado_data.iloc[:, 2:] + avocado_data.to_csv("avocado.csv", index=False) + + def dominant_color_dataset(): print("Downloading dominant color dataset...") From a1bfda120b5729844b39af4b651cac49c294a253 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 11 Aug 2024 15:51:42 +0200 Subject: [PATCH 
04/18] California housing Signed-off-by: Omar Shrit --- scripts/download_data_set.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/scripts/download_data_set.py b/scripts/download_data_set.py index 66c1a2af..3dd78320 100755 --- a/scripts/download_data_set.py +++ b/scripts/download_data_set.py @@ -165,6 +165,11 @@ def covertype_dataset(): progress_bar("covertype-small.csv.gz", covertype) ungzip("covertype-small.csv.gz", "covertype-small.csv") +def california_housing_dataset(): + print("Downloading the california housing dataset...") + california = requests.get("https://datasets.mlpack.org/examples/housing.csv") + progress_bar("housing.csv", california) + def avocado_dataset(): print("Downloading the avocado price prediction dataset...") avocado = requests.get("https://datasets.mlpack.org/avocado.csv.gz") @@ -223,17 +228,18 @@ def all_datasets(): Usage: --dataset_name dataset_name Available options: avocado: will download the avocado price prediction dataset - mnist : will download mnist dataset - electricity : will download electricty_consumption_dataset - stock : will download stock_exchange dataset - iris : will downlaod the iris dataset bodyFat : will download the bodyFat dataset - spam : will download the spam dataset - salary: will download the salary dataset + california: will download the california housing dataset cifar10: will download the cifar10 dataset - pima: will download the pima diabetes dataset color: will download the dominant color dataset covertype: will download the forest covertype dataset + electricity : will download electricty_consumption_dataset + iris : will downlaod the iris dataset + mnist : will download mnist dataset + pima: will download the pima diabetes dataset + salary: will download the salary dataset + spam : will download the spam dataset + stock : will download stock_exchange dataset all : will download all datasets for all examples ''')) @@ -260,6 +266,9 @@ def all_datasets(): elif 
args.dataset_name == "bodyFat": create_dataset_dir() body_fat_dataset() + elif args.dataset_name == "california": + create_dataset_dir() + california_housing_dataset() elif args.dataset_name == "spam": create_dataset_dir() spam_dataset() From 25b616de7d51702409d86d105e56c69bba2d8bf2 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 29 Sep 2024 19:20:54 +0200 Subject: [PATCH 05/18] Adding salary prediction Signed-off-by: Omar Shrit --- .../salary_prediction/salary-prediction.cpp | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 cpp/linear_regression/salary_prediction/salary-prediction.cpp diff --git a/cpp/linear_regression/salary_prediction/salary-prediction.cpp b/cpp/linear_regression/salary_prediction/salary-prediction.cpp new file mode 100644 index 00000000..0ff5384a --- /dev/null +++ b/cpp/linear_regression/salary_prediction/salary-prediction.cpp @@ -0,0 +1,103 @@ +/** + * Predicting Salary using Linear Regression. + * Objective: + * We have to predict the salary of an employee given how many years of experience they have. + * Dataset: + * Salary_Data.csv has 2 columns — “Years of Experience” (feature) and “Salary” + * (target) for 30 employees in a company. + * Approach: + * So in this example, we will train a Linear Regression model to learn the + * correlation between the number of years of experience of each employee and + * their respective salary. + * Once the model is trained, we will be able to do some sample predictions. +*/ +// !wget -q https://datasets.mlpack.org/Salary_Data.csv" +#include +#include + +using namespace mlpack; + +int main() +{ + // Load the dataset into armadillo matrix. + arma::mat inputs; + data::Load("Salary_Data.csv" inputs); + // Drop the first row as they represent header. + inputs.shed_col(0); + // Display the first 5 rows of the input data. 
+ std::cout << std::setw(18) << "Years Of Experience" << std::setw(10) << "Salary" << std::endl; + std::cout << inputs.submat(0, 0, inputs.n_rows-1, 5).t() << std::endl; + + // Split the data into features (X) and target (y) variables + // targets are the last row. + arma::Row targets = + arma::conv_to>::from(inputs.row(inputs.n_rows - 1)); + // Labels are dropped from the originally loaded data to be used as features. + inputs.shed_row(inputs.n_rows - 1); + + /* + * The dataset has to be split into a training set and a test set. + * This can be done using the `data::Split()` api from mlpack. + * Here the dataset has 30 observations and the `testRatio` is taken as 40% + * of the total observations. + * This indicates the test set should have 40% * 30 = 12 observations and + * training test should have 18 observations respectively. + * Split the dataset into train and test sets using mlpack. + */ + arma::mat Xtrain, Xtest; + arma::Row Ytrain, Ytest; + data::Split(inputs, targets, Xtrain, Xtest, Ytrain, Ytest, 0.4); + + // Convert armadillo Rows into rowvec. (Required by mlpacks' + // LinearRegression API in this format). + arma::rowvec yTrain = arma::conv_to::from(Ytrain); + arma::rowvec yTest = arma::conv_to::from(Ytest); + + /* + * Regression analysis is the most widely used method of prediction. Linear + * regression is used when the dataset has a linear correlation and as the + * name suggests, simple linear regression has one independent variable + * (predictor) and one dependent variable(response). + * The simple linear regression equation is represented as + * $y = a+bx$ where $x$ is the explanatory variable, $y$ is the dependent + * variable, $b$ is coefficient and $a$ is the intercept. + * To perform linear regression we'll be using `LinearRegression()` + * api from mlpack. + */ + + //Create and Train Linear Regression model. + LinearRegression lr(Xtrain, yTrain, 0.5); + + // Make predictions for test data points. 
+ arma::rowvec yPreds; + lr.Predict(Xtest, yPreds); + + // Convert armadillo vectors and matrices to vector for plotting purpose. + std::vector XtestPlot = arma::conv_to>::from(Xtest); + std::vector yTestPlot = arma::conv_to>::from(yTest); + std::vector yPredsPlot = arma::conv_to>::from(yPreds); + + /* + * Evaluation Metrics for Regression model. + * In the Previous cell we have visualized our model performance by plotting. + * the best fit line. Now we will use various evaluation metrics to understand + * how well our model has performed. + * Mean Absolute Error (MAE) is the sum of absolute differences between actual + * and predicted values, without considering the direction. + * $$ MAE = \\frac{\\sum_{i=1}^n\\lvert y_{i} - \\hat{y_{i}}\\rvert} {n} $$ + * Mean Squared Error (MSE) is calculated as the mean or average of the + * squared differences between predicted and expected target values in a + * dataset, a lower value is better + * $$ MSE = \\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2 $$ + * Root Mean Squared Error (RMSE), Square root of MSE yields root mean square + * error (RMSE) it indicates the spread of the residual errors. It is always + * positive, and a lower value indicates better performance. 
+ * $$ RMSE = \\sqrt{\\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2} $$ + */ + std::cout << "Mean Absolute Error: " + << arma::mean(arma::abs(yPreds - yTest)) << std::endl; + std::cout << "Mean Squared Error: " + << arma::mean(arma::pow(yPreds - yTest,2)) << std::endl; + std::cout << "Root Mean Squared Error: " + << sqrt(arma::mean(arma::pow(yPreds - yTest,2))) << std::endl; +} From d83aafbb912f2e8b70ca5aabb8e9a864c580865f Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 29 Sep 2024 19:23:04 +0200 Subject: [PATCH 06/18] Add a make file make it compile Signed-off-by: Omar Shrit --- .../salary_prediction/Makefile | 44 +++++++++++++++++++ .../salary_prediction/salary-prediction.cpp | 2 +- 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 cpp/linear_regression/salary_prediction/Makefile diff --git a/cpp/linear_regression/salary_prediction/Makefile b/cpp/linear_regression/salary_prediction/Makefile new file mode 100644 index 00000000..0f1ed677 --- /dev/null +++ b/cpp/linear_regression/salary_prediction/Makefile @@ -0,0 +1,44 @@ +# This is a simple Makefile used to build the example source code. +# This example might requires some modifications in order to work correctly on +# your system. +# If you're not using the Armadillo wrapper, replace `armadillo` with linker commands +# for the BLAS and LAPACK libraries that you are using. + +TARGET := salary_prediction +SRC := salary-prediction.cpp +LIBS_NAME := armadillo + +CXX := g++ +CXXFLAGS += -std=c++17 -Wall -Wextra -O3 -DNDEBUG -fopenmp +# Use these CXXFLAGS instead if you want to compile with debugging symbols and +# without optimizations. +# CXXFLAGS += -std=c++17 -Wall -Wextra -g -O0 + +LDFLAGS += -fopenmp +# Add header directories for any includes that aren't on the +# default compiler search path. +INCLFLAGS := -I . +# If you have mlpack or ensmallen installed somewhere nonstandard, uncomment and +# update the lines below. 
+# Uncomment the following if you are using the Scatter function for plotting +# INCLFLAGS += -I/usr/include/python3.11 +# INCLFLAGS += -I/path/to/ensmallen/include/ +CXXFLAGS += $(INCLFLAGS) + +OBJS := $(SRC:.cpp=.o) +LIBS := $(addprefix -l,$(LIBS_NAME)) +CLEAN_LIST := $(TARGET) $(OBJS) + +# default rule +default: all + +$(TARGET): $(OBJS) + $(CXX) $(OBJS) -o $(TARGET) $(LDFLAGS) $(LIBS) + +.PHONY: all +all: $(TARGET) + +.PHONY: clean +clean: + @echo CLEAN $(CLEAN_LIST) + @rm -f $(CLEAN_LIST) diff --git a/cpp/linear_regression/salary_prediction/salary-prediction.cpp b/cpp/linear_regression/salary_prediction/salary-prediction.cpp index 0ff5384a..ab5ba033 100644 --- a/cpp/linear_regression/salary_prediction/salary-prediction.cpp +++ b/cpp/linear_regression/salary_prediction/salary-prediction.cpp @@ -21,7 +21,7 @@ int main() { // Load the dataset into armadillo matrix. arma::mat inputs; - data::Load("Salary_Data.csv" inputs); + data::Load("Salary_Data.csv", inputs); // Drop the first row as they represent header. inputs.shed_col(0); // Display the first 5 rows of the input data. From a0a770ccf6bc6e915cbe1351645210dce79bde91 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 29 Sep 2024 19:26:31 +0200 Subject: [PATCH 07/18] Adding another example Signed-off-by: Omar Shrit --- .../Makefile | 44 +++++ .../california_house_price_prediction.cpp | 173 ++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 cpp/linear_regression/california_housing_price_prediction/Makefile create mode 100644 cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp diff --git a/cpp/linear_regression/california_housing_price_prediction/Makefile b/cpp/linear_regression/california_housing_price_prediction/Makefile new file mode 100644 index 00000000..315fd2cb --- /dev/null +++ b/cpp/linear_regression/california_housing_price_prediction/Makefile @@ -0,0 +1,44 @@ +# This is a simple Makefile used to build the example source code. 
+# This example might requires some modifications in order to work correctly on +# your system. +# If you're not using the Armadillo wrapper, replace `armadillo` with linker commands +# for the BLAS and LAPACK libraries that you are using. + +TARGET := california-house-price-prediction +SRC := california_house_price_prediction.cpp +LIBS_NAME := armadillo + +CXX := g++ +CXXFLAGS += -std=c++17 -Wall -Wextra -O3 -DNDEBUG -fopenmp +# Use these CXXFLAGS instead if you want to compile with debugging symbols and +# without optimizations. +# CXXFLAGS += -std=c++17 -Wall -Wextra -g -O0 + +LDFLAGS += -fopenmp +# Add header directories for any includes that aren't on the +# default compiler search path. +INCLFLAGS := -I . +# If you have mlpack or ensmallen installed somewhere nonstandard, uncomment and +# update the lines below. +# Uncomment the following if you are using the Scatter function for plotting +# INCLFLAGS += -I/usr/include/python3.11 +# INCLFLAGS += -I/path/to/ensmallen/include/ +CXXFLAGS += $(INCLFLAGS) + +OBJS := $(SRC:.cpp=.o) +LIBS := $(addprefix -l,$(LIBS_NAME)) +CLEAN_LIST := $(TARGET) $(OBJS) + +# default rule +default: all + +$(TARGET): $(OBJS) + $(CXX) $(OBJS) -o $(TARGET) $(LDFLAGS) $(LIBS) + +.PHONY: all +all: $(TARGET) + +.PHONY: clean +clean: + @echo CLEAN $(CLEAN_LIST) + @rm -f $(CLEAN_LIST) diff --git a/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp b/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp new file mode 100644 index 00000000..7bec161e --- /dev/null +++ b/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp @@ -0,0 +1,173 @@ +/** +* Predicting California House Prices with Linear Regression +* +* Objective +* +* To predict California Housing Prices using the most simple Linear Regression +* Model and see how it performs. To understand the modeling workflow using mlpack. 
+ +* About the Data +* +* This dataset is a modified version of the California Housing dataset available +* from Luís Torgo's page (University of Porto). Luís Torgo obtained it from the +* StatLib repository (which is closed now). The dataset may also be downloaded +* from StatLib mirrors. +* +* This dataset is also used in a book HandsOn-ML (a very good book and highly +* recommended: https://www.oreilly.com/library/view/hands-on-machine-learning/9781491962282/]). +* +* The dataset in this directory is almost identical to the original, with two +* differences: +* 207 values were randomly removed from the totalbedrooms column, so we can +* discuss what to do with missing data. An additional categorical attribute +* called oceanproximity was added, indicating (very roughly) whether each +* block group is near the ocean, near the Bay area, inland or on an island. +* This allows discussing what to do with categorical data. +* Note that the block groups are called \"districts\" in the Jupyter notebooks, +* simply because in some contexts the name \"block group\" was confusing. + +* Lets look at the features of the dataset: +* +* Longitude : Longitude coordinate of the houses. +* Latitude : Latitude coordinate of the houses. +* Housing Median Age : Average lifespan of houses. +* Total Rooms : Number of rooms in a location. +* Total Bedrooms : Number of bedroooms in a location. +* Population : Population in that location. +* Median Income : Median Income of households in a location. +* Median House Value : Median House Value in a location. +* Ocean Proximity : Closeness to shore. + +* Approach +* +* Here, we will try to recreate the workflow from the book mentioned above. +* Pre-Process the data for the Ml Algorithm. +* Create new features. +* Splitting the data. +* Training the ML model using mlpack. +* Residuals, Errors and Conclusion. 
+*/ + +#include + +using namespace mlpack; +using namespace mlpack::data; + + + //"But, there's one thing which we need to do before loading the dataset as an Armadillo matrix; that is, we need to deal with any missing values. Since 207 values were removed from the original dataset from \"total_bedrooms_column\", we need to fill them using either \"mean\" or \"median\" of that feature (for numerical) and \"mode\" (for categorical)." + //"// The imputing functions follows this:\n", + //// Impute(inputFile, outputFile, kind);\n", + //// Here, inputFile is our raw file, outputFile is our new file with the imputations, \n", + //// and kind refers to imputation method.\n", + +int main() +{ + /** + * we need to load the dataset as an Armadillo matrix for further operations. + * Our dataset has a total of 9 features: 8 numerical and + * 1 categorical(ocean proximity). We need to map the + * categorical features, as armadillo operates on numeric + * values only. + */ + arma::mat dataset; + data::DatasetInfo info; + info.Type(9) = mlpack::data::Datatype::categorical; + data::Load("housing_without_header.csv", dataset, info); + + // Should remove this and use brief_print function from arma + // Print the first 6 rows of the input data. + std::cout << dataset.submat(0, 0, dataset.n_rows - 1 , 5)<< std::endl; + + /* + * Did you notice something? Yes, the last row looks like it is entirely + * filled with '0'. Let's check our dataset to see what it corresponds to. + * It corresponds to Ocean Proximity which is a categorical value, but here + * it is zero. + * WWhy? It's because the load function loads numerical values only. This is + * exactly why we mapped Ocean proximity earlier. + * So, let's deal with this. + */ + arma::mat encoded_dataset; + // Here, we chose our pre-built encoding method "One Hot Encoding" to deal + // with the categorical values. 
+ data::OneHotEncoding(dataset, encoded_dataset, info); + // The dataset needs to be split into a training and testing set before we learn any model. + // Labels are median_house_value which is row 8 + arma::rowvec labels = + arma::conv_to::from(encoded_dataset.row(8)); + encoded_dataset.shed_row(8); + arma::mat trainSet, testSet; + arma::rowvec trainLabels, testLabels; + // Split dataset randomly into training set and test set. + data::Split(encoded_dataset, labels, trainSet, testSet, trainLabels, testLabels, + 0.2 /* Percentage of dataset to use for test set. */); + + // Training the linear model + /* Regression analysis is the most widely used method of prediction. + * Linear regression is used when the dataset has a linear correlation + * and as the name suggests, multiple linear regression has one independent + * variable (predictor) and one or more dependent variable(response). + */ + + /** + * The simple linear regression equation is represented as + * y = $a + b_{1}x_{1} + b_{2}x_{2} + b_{3}x_{3} + ... + b_{n}x_{n}$ + * where: + * $x_{i}$ is the ith explanatory variable, + * y is the dependent variable, + * $b_{i}$ is ith coefficient and a is the intercept. + */ + + /* To perform linear regression we'll be using the `LinearRegression` + * class from mlpack. + */ + LinearRegression lr(trainSet, trainLabels, 0.5); + + // The line above creates and train the model. + // Let's create a output vector for storing the results. + arma::rowvec output; + lr.Predict(testSet, output); + lr.ComputeError(trainSet, trainLabels); + std::cout << lr.ComputeError(trainSet, trainLabels); + + // Let's manually check some predictions. 
+ std::cout << testLabels[1] << std::endl; + std::cout << output[1] << std::endl;" + std::cout << testLabels[7] << std::endl; + std::cout << output[7] << std::endl;" + arma::mat preds; + preds.insert_rows(0, testLabels); + preds.insert_rows(1, output); + + arma::mat diffs = preds.row(1) - preds.row(0); + data::Save("preds.csv", preds); + data::Save("predsDiff.csv", diffs); + + /** + * Model Evaluation + * Evaluation Metrics for Regression model + * In the previous cell we have visualized our model performance by plotting + * the best fit line. Now we will use various evaluation metrics to understand + * how well our model has performed. + * Mean Absolute Error (MAE) is the sum of absolute differences between actual + * and predicted values, without considering the direction. + * MAE = \\frac{\\sum_{i=1}^n\\lvert y_{i} - \\hat{y_{i}}\\rvert} {n} + * Mean Squared Error (MSE) is calculated as the mean or average of the + * squared differences between predicted and expected target values in a + * dataset, a lower value is better + * MSE = \\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2 + * Root Mean Squared Error (RMSE), Square root of MSE yields + * root mean square error (RMSE) it indicates the spread of + * the residual errors. It is always positive, and a lower + * value indicates better performance. + * RMSE = \\sqrt{\\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2} + */ + + std::cout << "Mean Absolute Error: " + << arma::mean(arma::abs(output - testLabels)) << std::endl; + std::cout << "Mean Squared Error: " + << arma::mean(arma::pow(output - testLabels,2)) << std::endl; + std::cout << "Root Mean Squared Error: " + << sqrt(arma::mean(arma::pow(output - testLabels,2))) << std::endl; +} + From f04f9d440902437fef9f707de5e78bfb772a8d13 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 29 Sep 2024 19:28:27 +0200 Subject: [PATCH 08/18] Compiling.. 
Signed-off-by: Omar Shrit --- .../california_house_price_prediction.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp b/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp index 7bec161e..4c0ccf41 100644 --- a/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp +++ b/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp @@ -132,9 +132,9 @@ int main() // Let's manually check some predictions. std::cout << testLabels[1] << std::endl; - std::cout << output[1] << std::endl;" + std::cout << output[1] << std::endl; std::cout << testLabels[7] << std::endl; - std::cout << output[7] << std::endl;" + std::cout << output[7] << std::endl; arma::mat preds; preds.insert_rows(0, testLabels); preds.insert_rows(1, output); @@ -162,7 +162,6 @@ int main() * value indicates better performance. * RMSE = \\sqrt{\\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2} */ - std::cout << "Mean Absolute Error: " << arma::mean(arma::abs(output - testLabels)) << std::endl; std::cout << "Mean Squared Error: " From a283367cb2059d307df5fbcf7e9ed4e8b71ddbd1 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 29 Sep 2024 19:39:01 +0200 Subject: [PATCH 09/18] Salary prediction example functional Signed-off-by: Omar Shrit --- cpp/linear_regression/salary_prediction/salary-prediction.cpp | 2 +- scripts/download_data_set.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/linear_regression/salary_prediction/salary-prediction.cpp b/cpp/linear_regression/salary_prediction/salary-prediction.cpp index ab5ba033..9d5ee967 100644 --- a/cpp/linear_regression/salary_prediction/salary-prediction.cpp +++ b/cpp/linear_regression/salary_prediction/salary-prediction.cpp @@ -21,7 +21,7 @@ int main() { // Load the dataset into armadillo matrix. 
arma::mat inputs; - data::Load("Salary_Data.csv", inputs); + data::Load("../../../data/Salary_Data.csv", inputs); // Drop the first row as they represent header. inputs.shed_col(0); // Display the first 5 rows of the input data. diff --git a/scripts/download_data_set.py b/scripts/download_data_set.py index 3dd78320..70d410a6 100755 --- a/scripts/download_data_set.py +++ b/scripts/download_data_set.py @@ -178,8 +178,6 @@ def avocado_dataset(): avocado_data = pull_csv("avocado.csv") avocado_data = avocado_data.iloc[:, 2:] avocado_data.to_csv("avocado.csv", index=False) - - def dominant_color_dataset(): print("Downloading dominant color dataset...") From 76f3ed54ce01124600d1611b2188aa52dfbb70b5 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Sun, 29 Sep 2024 20:50:28 +0200 Subject: [PATCH 10/18] Adding the california housing price prediction example Signed-off-by: Omar Shrit --- .../california_house_price_prediction.cpp | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp b/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp index 4c0ccf41..27d9fc1d 100644 --- a/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp +++ b/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp @@ -54,11 +54,16 @@ using namespace mlpack; using namespace mlpack::data; - //"But, there's one thing which we need to do before loading the dataset as an Armadillo matrix; that is, we need to deal with any missing values. Since 207 values were removed from the original dataset from \"total_bedrooms_column\", we need to fill them using either \"mean\" or \"median\" of that feature (for numerical) and \"mode\" (for categorical)." 
- //"// The imputing functions follows this:\n", - //// Impute(inputFile, outputFile, kind);\n", - //// Here, inputFile is our raw file, outputFile is our new file with the imputations, \n", - //// and kind refers to imputation method.\n", +//But, there's one thing which we need to do before loading the dataset +//as an Armadillo matrix; that is, we need to deal with any missing values. +//Since 207 values were removed from the original dataset from +//\"total_bedrooms_column\", we need to fill them using either +//\"mean\" or \"median\" of that feature (for numerical) and +//\"mode\" (for categorical). +// The imputing functions follows this +// Impute(inputFile, outputFile, kind) +// Here, inputFile is our raw file, outputFile is our new file with the imputations. +// and kind refers to imputation method. int main() { @@ -72,12 +77,11 @@ int main() arma::mat dataset; data::DatasetInfo info; info.Type(9) = mlpack::data::Datatype::categorical; - data::Load("housing_without_header.csv", dataset, info); + // Please remove the header of the file if exist, otherwise the results will + // not work + data::Load("../../../data/housing.csv", dataset, info); + dataset.brief_print(); - // Should remove this and use brief_print function from arma - // Print the first 6 rows of the input data. - std::cout << dataset.submat(0, 0, dataset.n_rows - 1 , 5)<< std::endl; - /* * Did you notice something? Yes, the last row looks like it is entirely * filled with '0'. Let's check our dataset to see what it corresponds to. 
From 56064a544ec67c4ff34b00c7af2951168bb22dda Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Wed, 2 Oct 2024 17:59:18 +0200 Subject: [PATCH 11/18] Update cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp Co-authored-by: Ryan Curtin --- .../avocado_price_prediction/avocado_price_prediction.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp index fc34ad58..4eeecf19 100644 --- a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp +++ b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp @@ -75,8 +75,8 @@ int main() // 14600 observations respectively. arma::mat Xtrain; arma::mat Xtest; - arma::Row Ytrain; - arma::Row Ytest; + arma::rowvec Ytrain; + arma::rowvec Ytest; data::Split(output, targets, Xtrain, Xtest, Ytrain, Ytest, 0.2); // Convert armadillo Rows into rowvec. (Required by mlpacks' LinearRegression API in this format). 
arma::rowvec yTrain = arma::conv_to::from(Ytrain); From b005ecac4ed3a359492da8a413693361f5b7dc8a Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Wed, 2 Oct 2024 17:59:38 +0200 Subject: [PATCH 12/18] Update cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp Co-authored-by: Ryan Curtin --- .../avocado_price_prediction/avocado_price_prediction.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp index 4eeecf19..df503510 100644 --- a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp +++ b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp @@ -67,6 +67,7 @@ int main() arma::Row targets = arma::conv_to>::from(output.row(0)); // Labels are dropped from the originally loaded data to be used as features. output.shed_row(0); + // Train Test Split, // The dataset has to be split into a training set and a test set. Here the // dataset has 18249 observations and the `testRatio` is set to 20% of the From 529d66bf93f5ad5bb3223740b598e5baf5c83483 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Wed, 2 Oct 2024 17:59:54 +0200 Subject: [PATCH 13/18] Update cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp Co-authored-by: Ryan Curtin --- .../avocado_price_prediction/avocado_price_prediction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp index df503510..b3f1fcd5 100644 --- a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp +++ b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp @@ -29,8 +29,8 @@ * various evaluation metrics to quantify how well our model behaves. 
*/ #include + using namespace mlpack; -using namespace mlpack::data; //Drop the dataset header using sed, sed is a Unix utility that parses and transforms text." //!mkdir -p data && cat avocado.csv | sed 1d > avocado_trim.csv" From 3a252157f3a73a8bb62fd374de4dc90c0a6b7974 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 4 Oct 2024 12:59:32 +0200 Subject: [PATCH 14/18] Fix the example and make it run Signed-off-by: Omar Shrit --- .../avocado_price_prediction.cpp | 214 ++++++++---------- 1 file changed, 92 insertions(+), 122 deletions(-) diff --git a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp index b3f1fcd5..aa70ab1b 100644 --- a/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp +++ b/cpp/linear_regression/avocado_price_prediction/avocado_price_prediction.cpp @@ -2,140 +2,110 @@ * Predicting Avocado's Average Price using Linear Regression * Our target is to predict the future price of avocados depending on various * features (Type, Region, Total Bags, ...). -* -* Dataset -* -* Avocado Prices dataset has the following features: -* PLU - Product Lookup Code in Hass avocado board. -* Date - The date of the observation. -* AveragePrice - Observed average price of single avocado. -* Total Volume - Total number of avocado's sold. -* 4046 - Total number of avocado's with PLU 4046 sold. -* 4225 - Total number of avocado's with PLU 4225 sold. -* 4770 - Total number of avocado's with PLU 4770 sold. -* Total Bags = Small Bags + Large Bags + XLarge Bags. -* Type - Conventional or organic. -* Year - Year of observation. -* Region - City or region of observation. -* * Approach * -* In this example, first we will do EDA on the dataset to find correlation -* between various features. -* Then we'll be using onehot encoding to encode categorical features. 
-* Finally we will use LinearRegression API from mlpack to learn the correlation -* between various features and the target i.e AveragePrice. +* In this example, we will be using one hot encoding to encode categorical +* features. Then, we will use LinearRegression API from mlpack to learn +* the correlation between various features and the target i.e AveragePrice. * After training the model, we will use it to do some predictions, followed by * various evaluation metrics to quantify how well our model behaves. */ + #include using namespace mlpack; -//Drop the dataset header using sed, sed is a Unix utility that parses and transforms text." -//!mkdir -p data && cat avocado.csv | sed 1d > avocado_trim.csv" -//"Drop columns 1 and 2 (\"Unamed: 0\", \"Date\") as these are not required and their presence cause issues while loading the data." -//!rm avocado_trim.csv" -//"!mv avocado_trim2.csv avocado_trim.csv" - int main() { - //!wget -q https://datasets.mlpack.org/avocado.csv.gz" - // Features 9 (Avocado type) and 11 (region of observation) are strings - // (categorical), but armadillo matrices can contain only numeric information; - // so, we have to explicitly define them as categorical in `datasetInfo` - // this allows mlpack to map numeric values to each of those values, - // which can later be unmapped to strings. - // Load the dataset into armadillo matrix. + /** Dataset + * + * Avocado Prices dataset has the following features: + * PLU - Product Lookup Code in Hass avocado board. + * Date - The date of the observation. + * AveragePrice - Observed average price of single avocado. + * Total Volume - Total number of avocado's sold. + * 4046 - Total number of avocado's with PLU 4046 sold. + * 4225 - Total number of avocado's with PLU 4225 sold. + * 4770 - Total number of avocado's with PLU 4770 sold. + * Total Bags = Small Bags + Large Bags + XLarge Bags. + * Type - Conventional or organic. + * Year - Year of observation. + * Region - City or region of observation. 
+ * + * Features 9 (Avocado type) and 11 (region of observation) are categorical strings, + * but armadillo matrices can contain only numeric information. + * Therefore, we explicitly define them as categorical in `datasetInfo`; + * this allows mlpack to map numeric values to each of those values, + * which can later be unmapped to strings. + */ + /** PLEASE, delete the header of the dataset once you have downloaded the + * dataset to your data/ directory. **/ + // Load the dataset into armadillo matrix. + arma::mat matrix; + data::DatasetInfo info; + info.Type(9) = data::Datatype::categorical; + info.Type(11) = data::Datatype::categorical; + data::Load("../../../data/avocado.csv", matrix, info); - arma::mat matrix; - data::DatasetInfo info; - info.Type(9) = data::Datatype::categorical; - info.Type(11) = data::Datatype::categorical; - data::Load("../../../data/avocado.csv", matrix, info); - // Printing header for dataset. - std::cout << std::setw(10) << "AveragePrice" << std::setw(14) - << "Total Volume" << std::setw(9) << "4046" << std::setw(13) - << "4225" << std::setw(13) << "4770" << std::setw(17) << "Total Bags" - << std::setw(13) << "Small Bags" << std::setw(13) << "Large Bags" - << std::setw(17) << "XLarge Bags" << std::setw(10) << "Type" - << std::setw(10) << "Year" << std::setw(15) << "Region" << std::endl; - std::cout << matrix.submat(0, 0, matrix.n_rows-1, 5).t() << std::endl; - // Exploratory Data Analysis - arma::mat output; - data::OneHotEncoding(matrix, output, info); - arma::Row targets = arma::conv_to>::from(output.row(0)); - // Labels are dropped from the originally loaded data to be used as features. - output.shed_row(0); - - // Train Test Split, - // The dataset has to be split into a training set and a test set. Here the - // dataset has 18249 observations and the `testRatio` is set to 20% of the - // total observations.
This indicates the test set should have - // 20% * 18249 = 3649 observations and training test should have - // 14600 observations respectively. - arma::mat Xtrain; - arma::mat Xtest; - arma::rowvec Ytrain; - arma::rowvec Ytest; - data::Split(output, targets, Xtrain, Xtest, Ytrain, Ytest, 0.2); - // Convert armadillo Rows into rowvec. (Required by mlpacks' LinearRegression API in this format). - arma::rowvec yTrain = arma::conv_to::from(Ytrain); - arma::rowvec yTest = arma::conv_to::from(Ytest); + arma::mat output; + data::OneHotEncoding(matrix, output, info); + arma::rowvec targets = arma::conv_to::from(output.row(0)); - /* Training the linear model. - * Regression analysis is the most widely used method of prediction. - * Linear regression is used when the dataset has a linear correlation - * and as the name suggests, multiple linear regression has one independent - * variable (predictor) and one or more dependent variable(response). - * The simple linear regression equation is represented as - * y = $a + b_{1}x_{1} + b_{2}x_{2} + b_{3}x_{3} + ... + b_{n}x_{n}$ - * where $x_{i}$ is the ith explanatory variable, y is the dependent - * variable, $b_{i}$ is ith coefficient and a is the intercept. - * To perform linear regression we'll be using the `LinearRegression` class from mlpack. - * Create and train Linear Regression model. - */ - LinearRegression lr(Xtrain, yTrain, 0.5); - arma::rowvec yPreds; - lr.Predict(Xtest, yPreds); - // Save the yTest and yPreds into csv for generating plots. - arma::mat preds; - preds.insert_rows(0, yTest); - preds.insert_rows(1, yPreds); - arma::mat histpreds = yTest - yPreds; - mlpack::data::Save("./data/predictions.csv", preds); - mlpack::data::Save("./data/predsDiff.csv", yPreds); - /* - * Model Evaluation, - * Test data is visualized with `yTest` and `yPreds`, the blue points - * indicates the data points and the blue line indicates the regression - * line or best fit line. 
- * Evaluation Metrics for Regression model, - * In the previous cell we have visualized our model performance by plotting - * the best fit line. Now we will use various evaluation metrics to understand - * how well our model has performed. - * Mean Absolute Error (MAE) is the sum of absolute differences between actual - * and predicted values, without considering the direction. - * MAE = \\frac{\\sum_{i=1}^n\\lvert y_{i} - \\hat{y_{i}}\\rvert} {n} - * Mean Squared Error (MSE) is calculated as the mean or average of the - * squared differences between predicted and expected target values in - * a dataset, a lower value is better - * MSE = \\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2, - * Root Mean Squared Error (RMSE), Square root of MSE yields root mean square - * error (RMSE) it indicates the spread of the residual errors. It is always - * positive, and a lower value indicates better performance. - * RMSE = \\sqrt{\\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2} - */ - // Model evaluation metrics. - // From the above metrics, we can notice that our model MAE is ~0.2, - // which is relatively small compared to our average price of $1.405, - // from this and the above plot we can conclude our model is a reasonably - // good fit. + // Labels are dropped from the originally loaded data to be used as features. + output.shed_row(0); + + // Train Test Split, + // The dataset has to be split into a training set and a test set. Here the + // dataset has 18249 observations and the `testRatio` is set to 20% of the + // total observations. This indicates the test set should have + // 20% * 18249 = 3649 observations and training test should have + // 14600 observations respectively. 
+ arma::mat Xtrain, Xtest; + arma::rowvec Ytrain, Ytest; + data::Split(output, targets, Xtrain, Xtest, Ytrain, Ytest, 0.2); - std::cout << "Mean Absolute Error: " - << arma::mean(arma::abs(yPreds - yTest)) << std::endl; - std::cout << "Mean Squared Error: " - << arma::mean(arma::pow(yPreds - yTest,2)) << std::endl; - std::cout << "Root Mean Squared Error: " - << sqrt(arma::mean(arma::pow(yPreds - yTest,2))) << std::endl; + /* Training the linear model. + * Regression analysis is the most widely used method of prediction. + * Linear regression is used when the dataset has a linear correlation + * and as the name suggests, multiple linear regression has one dependent + * variable (response) and one or more independent variables (predictors). + * The multiple linear regression equation is represented as + * y = $a + b_{1}x_{1} + b_{2}x_{2} + b_{3}x_{3} + ... + b_{n}x_{n}$ + * where $x_{i}$ is the ith explanatory variable, y is the dependent + * variable, $b_{i}$ is ith coefficient and a is the intercept. + * To perform linear regression we'll be using the `LinearRegression` class from mlpack. + * Create and train Linear Regression model. + */ + LinearRegression lr(Xtrain, Ytrain, 0.5); + + arma::rowvec Ypreds; + lr.Predict(Xtest, Ypreds); + + /* + * Model Evaluation, + * To evaluate the model we use Mean Absolute Error (MAE) which + * is the mean of absolute differences between actual + * and predicted values, without considering the direction. + * MAE = \\frac{\\sum_{i=1}^n\\lvert y_{i} - \\hat{y_{i}}\\rvert} {n} + * Mean Squared Error (MSE) is calculated as the mean or average of the + * squared differences between predicted and expected target values in + * a dataset, a lower value is better + * MSE = \\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2, + * Root Mean Squared Error (RMSE), Square root of MSE yields root mean square + * error (RMSE) it indicates the spread of the residual errors. It is always + * positive, and a lower value indicates better performance.
+ * RMSE = \\sqrt{\\frac {1}{n} \\sum_{i=1}^n (y_{i} - \\hat{y_{i}})^2} + */ + // Model evaluation metrics. + // From the above metrics, we can notice that our model MAE is ~0.2, + // which is relatively small compared to our average price of $1.405, + // from this and the above plot we can conclude our model is a reasonably + // good fit. + + std::cout << "Mean Absolute Error: " + << arma::mean(arma::abs(Ypreds - Ytest)) << std::endl; + std::cout << "Mean Squared Error: " + << arma::mean(arma::pow(Ypreds - Ytest, 2)) << std::endl; + std::cout << "Root Mean Squared Error: " + << sqrt(arma::mean(arma::pow(Ypreds - Ytest, 2))) << std::endl; } From f541e411b1ba8edec1a6503d1d0a75efb4d32312 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 4 Oct 2024 13:06:45 +0200 Subject: [PATCH 15/18] Fix the bugs in california housing Signed-off-by: Omar Shrit --- .../california_house_price_prediction.cpp | 74 +++++-------------- 1 file changed, 18 insertions(+), 56 deletions(-) diff --git a/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp b/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp index 27d9fc1d..0c826a09 100644 --- a/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp +++ b/cpp/linear_regression/california_housing_price_prediction/california_house_price_prediction.cpp @@ -5,39 +5,7 @@ * * To predict California Housing Prices using the most simple Linear Regression * Model and see how it performs. To understand the modeling workflow using mlpack. - -* About the Data -* -* This dataset is a modified version of the California Housing dataset available -* from Luís Torgo's page (University of Porto). Luís Torgo obtained it from the -* StatLib repository (which is closed now). The dataset may also be downloaded -* from StatLib mirrors. 
-* -* This dataset is also used in a book HandsOn-ML (a very good book and highly -* recommended: https://www.oreilly.com/library/view/hands-on-machine-learning/9781491962282/]). -* -* The dataset in this directory is almost identical to the original, with two -* differences: -* 207 values were randomly removed from the totalbedrooms column, so we can -* discuss what to do with missing data. An additional categorical attribute -* called oceanproximity was added, indicating (very roughly) whether each -* block group is near the ocean, near the Bay area, inland or on an island. -* This allows discussing what to do with categorical data. -* Note that the block groups are called \"districts\" in the Jupyter notebooks, -* simply because in some contexts the name \"block group\" was confusing. - -* Lets look at the features of the dataset: * -* Longitude : Longitude coordinate of the houses. -* Latitude : Latitude coordinate of the houses. -* Housing Median Age : Average lifespan of houses. -* Total Rooms : Number of rooms in a location. -* Total Bedrooms : Number of bedroooms in a location. -* Population : Population in that location. -* Median Income : Median Income of households in a location. -* Median House Value : Median House Value in a location. -* Ocean Proximity : Closeness to shore. - * Approach * * Here, we will try to recreate the workflow from the book mentioned above. @@ -51,23 +19,27 @@ #include using namespace mlpack; -using namespace mlpack::data; - - -//But, there's one thing which we need to do before loading the dataset -//as an Armadillo matrix; that is, we need to deal with any missing values. -//Since 207 values were removed from the original dataset from -//\"total_bedrooms_column\", we need to fill them using either -//\"mean\" or \"median\" of that feature (for numerical) and -//\"mode\" (for categorical). 
-// The imputing functions follows this -// Impute(inputFile, outputFile, kind) -// Here, inputFile is our raw file, outputFile is our new file with the imputations. -// and kind refers to imputation method. int main() { /** + * Dataset structure + * + * This dataset is a modified version of the California Housing + * dataset available from Luís Torgo's page (University of Porto). + * Luís Torgo obtained it from the StatLib repository. The dataset + * may also be downloaded from StatLib mirrors. + * + * Longitude : Longitude coordinate of the houses. + * Latitude : Latitude coordinate of the houses. + * Housing Median Age : Median age of the houses in a location. + * Total Rooms : Number of rooms in a location. + * Total Bedrooms : Number of bedrooms in a location. + * Population : Population in that location. + * Median Income : Median Income of households in a location. + * Median House Value : Median House Value in a location. + * Ocean Proximity : Closeness to shore. + * * we need to load the dataset as an Armadillo matrix for further operations. * Our dataset has a total of 9 features: 8 numerical and * 1 categorical(ocean proximity). We need to map the @@ -80,17 +52,7 @@ int main() // Please remove the header of the file if exist, otherwise the results will // not work data::Load("../../../data/housing.csv", dataset, info); - dataset.brief_print(); - /* - * Did you notice something? Yes, the last row looks like it is entirely - * filled with '0'. Let's check our dataset to see what it corresponds to. - * It corresponds to Ocean Proximity which is a categorical value, but here - * it is zero. - * WWhy? It's because the load function loads numerical values only. This is - * exactly why we mapped Ocean proximity earlier. - * So, let's deal with this. - */ arma::mat encoded_dataset; // Here, we chose our pre-built encoding method "One Hot Encoding" to deal // with the categorical values.
@@ -100,9 +62,9 @@ int main() arma::rowvec labels = arma::conv_to::from(encoded_dataset.row(8)); encoded_dataset.shed_row(8); + arma::mat trainSet, testSet; arma::rowvec trainLabels, testLabels; - // Split dataset randomly into training set and test set. data::Split(encoded_dataset, labels, trainSet, testSet, trainLabels, testLabels, 0.2 /* Percentage of dataset to use for test set. */); From 7248b4407352f2f1600641dd56ae15a335cb2f4c Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 4 Oct 2024 13:15:12 +0200 Subject: [PATCH 16/18] Clean a bit this example Signed-off-by: Omar Shrit --- .../salary_prediction/salary-prediction.cpp | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/cpp/linear_regression/salary_prediction/salary-prediction.cpp b/cpp/linear_regression/salary_prediction/salary-prediction.cpp index 9d5ee967..ef9d31b1 100644 --- a/cpp/linear_regression/salary_prediction/salary-prediction.cpp +++ b/cpp/linear_regression/salary_prediction/salary-prediction.cpp @@ -2,16 +2,12 @@ * Predicting Salary using Linear Regression. * Objective: * We have to predict the salary of an employee given how many years of experience they have. - * Dataset: - * Salary_Data.csv has 2 columns — “Years of Experience” (feature) and “Salary” - * (target) for 30 employees in a company. * Approach: * So in this example, we will train a Linear Regression model to learn the * correlation between the number of years of experience of each employee and * their respective salary. * Once the model is trained, we will be able to do some sample predictions. */ -// !wget -q https://datasets.mlpack.org/Salary_Data.csv" #include #include @@ -19,14 +15,16 @@ using namespace mlpack; int main() { - // Load the dataset into armadillo matrix. + /* Dataset: + * Salary_Data.csv has 2 columns — “Years of Experience” (feature) and “Salary” + * (target) for 30 employees in a company. 
+ * + * If the dataset contains a header, please remove the + * header before loading it; otherwise the Load function may not work + * correctly. + */ arma::mat inputs; data::Load("../../../data/Salary_Data.csv", inputs); - // Drop the first row as they represent header. - inputs.shed_col(0); - // Display the first 5 rows of the input data. - std::cout << std::setw(18) << "Years Of Experience" << std::setw(10) << "Salary" << std::endl; - std::cout << inputs.submat(0, 0, inputs.n_rows-1, 5).t() << std::endl; // Split the data into features (X) and target (y) variables // targets are the last row. @@ -71,12 +69,7 @@ int main() // Make predictions for test data points. arma::rowvec yPreds; lr.Predict(Xtest, yPreds); - - // Convert armadillo vectors and matrices to vector for plotting purpose. - std::vector XtestPlot = arma::conv_to>::from(Xtest); - std::vector yTestPlot = arma::conv_to>::from(yTest); - std::vector yPredsPlot = arma::conv_to>::from(yPreds); - + /* * Evaluation Metrics for Regression model. * In the Previous cell we have visualized our model performance by plotting. From 502b7174d07820eb13f3b989b608b623eff18156 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 4 Oct 2024 13:19:14 +0200 Subject: [PATCH 17/18] Add pandas Signed-off-by: Omar Shrit --- .ci/linux-steps.yaml | 2 +- .ci/macos-steps.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/linux-steps.yaml b/.ci/linux-steps.yaml index 7b5806b2..e7c7e3d1 100644 --- a/.ci/linux-steps.yaml +++ b/.ci/linux-steps.yaml @@ -65,7 +65,7 @@ steps: # Download datasets. - script: | - python -m pip install tqdm requests + python -m pip install tqdm requests pandas cd scripts/ ./download_data_set.py cd ../ diff --git a/.ci/macos-steps.yaml b/.ci/macos-steps.yaml index d1aeb398..60665bc3 100644 --- a/.ci/macos-steps.yaml +++ b/.ci/macos-steps.yaml @@ -69,7 +69,7 @@ steps: # Download datasets.
- script: | - python -m pip install tqdm requests + python -m pip install tqdm requests pandas cd scripts/ ./download_data_set.py cd ../ From 2dbbb970da946fc89f774aa7c5d355f9cec95323 Mon Sep 17 00:00:00 2001 From: Omar Shrit Date: Fri, 4 Oct 2024 17:22:17 +0200 Subject: [PATCH 18/18] Change armadillo version Signed-off-by: Omar Shrit --- .ci/macos-steps.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.ci/macos-steps.yaml b/.ci/macos-steps.yaml index 60665bc3..5c0ac74d 100644 --- a/.ci/macos-steps.yaml +++ b/.ci/macos-steps.yaml @@ -21,15 +21,15 @@ steps: mkdir deps/ cd deps/ - # Install Armadillo 9.800.1 (the oldest supported version). - curl -O http://files.mlpack.org/armadillo-9.800.1.tar.gz - tar xvzf armadillo-9.800.1.tar.gz - cd armadillo-9.800.1 + # Install Armadillo 10.8.2 (the oldest supported version). + curl -O https://files.mlpack.org/armadillo-10.8.2.tar.gz + tar xvzf armadillo-10.8.2.tar.gz + cd armadillo-10.8.2 cmake . make sudo make install cd ../ - rm -rf armadillo-9.800.1/ + rm -rf armadillo-10.8.2/ # Build and install the latest version of ensmallen. curl -O https://www.ensmallen.org/files/ensmallen-latest.tar.gz