diff --git a/content/contributors/diegosanchezperez.md b/content/contributors/diegosanchezperez.md index 57f6c0df5..f08b4a38e 100644 --- a/content/contributors/diegosanchezperez.md +++ b/content/contributors/diegosanchezperez.md @@ -16,5 +16,5 @@ social: email: d.sanchezperez@tilburguniversity.edu image: diegosanchez.webp -status: "active" +status: "alumni" --- \ No newline at end of file diff --git a/content/contributors/fernandoiscar.md b/content/contributors/fernandoiscar.md index 6538d583f..5e3e80b93 100644 --- a/content/contributors/fernandoiscar.md +++ b/content/contributors/fernandoiscar.md @@ -15,5 +15,5 @@ social: email: f.iscar@tilburguniversity.edu image: fernandoiscar.webp -status: "active" +status: "alumni" --- \ No newline at end of file diff --git a/content/contributors/fleurlemire.md b/content/contributors/fleurlemire.md index f6fe588dd..ae62c5ca1 100644 --- a/content/contributors/fleurlemire.md +++ b/content/contributors/fleurlemire.md @@ -14,5 +14,5 @@ social: link: https://www.linkedin.com/in/fleurlemire/ email: f.m.d.lemire@tilburguniversity.edu image: fleur.webp -status: "active" +status: "alumni" --- diff --git a/content/contributors/malihehmahlouji.md b/content/contributors/malihehmahlouji.md new file mode 100644 index 000000000..acf1a781b --- /dev/null +++ b/content/contributors/malihehmahlouji.md @@ -0,0 +1,15 @@ +--- +name: "Maliheh Mahlouji" +description_short: "Data Steward at Tilburg University." +description_long: "I love coding and data science. I support researchers at Tilburg University with their data-intensive problems." +skills: + - Python + - SQL + - Data Science +social: + - name: LinkedIn + link: https://www.linkedin.com/in/maliheh-mahlouji-271b12112 +email: m.mahlouji@tilburguniversity.edu +image: +status: "active" +--- diff --git a/content/contributors/matteozicari.md b/content/contributors/matteozicari.md index 64975909a..6f924e0da 100644 --- a/content/contributors/matteozicari.md +++ b/content/contributors/matteozicari.md @@ -14,6 +14,6 @@ social: link: https://www.linkedin.com/in/matteozicari/ email: m.zicari@tilburguniversity.edu image: matteozicari.webp -status: "active" +status: "alumni" --- diff --git a/content/contributors/roshinisudhaharan.md b/content/contributors/roshinisudhaharan.md new file mode 100644 index 000000000..4f04197f8 --- /dev/null +++ b/content/contributors/roshinisudhaharan.md @@ -0,0 +1,18 @@ +--- +name: "Roshini Sudhaharan" +description_short: "As a graduate research assistant, I lead our team of research assistants and manage causal inference content development." +description_long: "As a budding researcher, I am deeply passionate about deriving rich insights from unstructured data, employing Natural Language Processing and machine learning techniques, alongside causal inference methods, for informed decision making. Joining TSH in 2021 at the outset of my research journey was transformative, exposing me to the power of open science best practices. I am committed to inspiring the community by sharing cutting-edge tools that enhance efficiency and align with open science principles."
+ +skills: + - R + - Python + - Causal inference + - Natural Language Processing + - Cloud Computing +social: + - name: LinkedIn + link: https://www.linkedin.com/in/roshinisudhaharan/ +email: r.sudhaharan@tilburguniversity.edu +image: roshinisudhaharan.webp +status: "active" +--- diff --git a/content/topics/Analyze/Non-parametric-tests/_index.md b/content/topics/Analyze/Non-parametric-tests/_index.md index bd61aaa39..2e3f2ca9e 100644 --- a/content/topics/Analyze/Non-parametric-tests/_index.md +++ b/content/topics/Analyze/Non-parametric-tests/_index.md @@ -1,5 +1,5 @@ --- draft: false -title: "Non Parametric Tests" +title: "Tests" weight: 1 --- diff --git a/content/topics/Analyze/Non-parametric-tests/tests/_index.md b/content/topics/Analyze/Non-parametric-tests/tests/_index.md new file mode 100644 index 000000000..5561cd941 --- /dev/null +++ b/content/topics/Analyze/Non-parametric-tests/tests/_index.md @@ -0,0 +1,5 @@ +--- +draft: false +title: "Non-parametric Tests" +weight: 1 +--- diff --git a/content/topics/Analyze/causal-inference/did/_index.md b/content/topics/Analyze/causal-inference/did/_index.md index 5b74ff8da..999cb0bbd 100644 --- a/content/topics/Analyze/causal-inference/did/_index.md +++ b/content/topics/Analyze/causal-inference/did/_index.md @@ -1,6 +1,6 @@ --- draft: false title: "Difference in Difference" -weight: 2 +weight: 3 --- \ No newline at end of file diff --git a/content/topics/Analyze/causal-inference/did/canonical-did-regression.md b/content/topics/Analyze/causal-inference/did/canonical-did-regression.md index 68a60cd7e..1290578ff 100644 --- a/content/topics/Analyze/causal-inference/did/canonical-did-regression.md +++ b/content/topics/Analyze/causal-inference/did/canonical-did-regression.md @@ -1,16 +1,18 @@ --- -title: "Canonical Difference-in-Difference as a Regression" -description: "This building block walks you through DiD as a regression, motivates the use of Two-Way Fixed Effects (TWFE) and clustered standard errors " +title: "Difference-in-Difference as a Regression" +description: "This topic walks you through DiD as a regression, motivates the use of Two-Way Fixed Effects (TWFE) and clustered standard errors " keywords: "causal inference, difference-in-difference, DID, R, regression, model, canonical DiD, difference in means table, potential outcomes framework, average treatment effect, ATE, ATT, ATU, treatment effects, regression, TWFE, clustered standard errors" draft: false -weight: 12 +weight: 2 author: "Roshini Sudhaharan" authorlink: "https://nl.linkedin.com/in/roshinisudhaharan" aliases: - /canonical-DiD - /canonical-DiD/run --- -# Overview + + +## Overview In the context of non-feasible randomized controlled experiments, we [previously](/canonical-DiD) discussed the importance of the difference-in-difference (DiD) approach for causal inference. While calculating treatment effects using the difference-in-means method is a starting point, it lacks sufficient grounds for reliable inference. To obtain more robust results, it is crucial to estimate treatment effects through regression analysis with the appropriate model specification. Regression models allow for controlling confounding variables, accounting for unobserved heterogeneity, and incorporating fixed effects, leading to more accurate and meaningful interpretations of treatment effects. Next, we’ll dig a little deeper into the merits of the regression approach and how to carry out the estimation in R using an illustrative example. 
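For intuition, the canonical 2 x 2 DiD boils down to a single interaction term in a linear regression. Below is a minimal sketch of that idea (not the analysis code from the Gist): it assumes a hypothetical data frame `df` with an outcome `y`, a treatment-group dummy `treated`, and a post-period dummy `post`.

{{% codeblock %}}
```R
# Canonical 2x2 DiD as an interaction regression
# (sketch; `df`, `y`, `treated`, and `post` are hypothetical names)
did_fit <- lm(y ~ treated * post, data = df)
summary(did_fit)
# The coefficient on `treated:post` is the DiD estimate of the treatment effect
```
{{% /codeblock %}}

In the illustrative example this interaction corresponds to $\delta$ in the regression equation below; the actual estimation later uses `feols()` from the `fixest` package so that fixed effects and clustered standard errors can be added.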
@@ -29,27 +31,31 @@ In the context of non-feasible randomized controlled experiments, we [previously By considering the context and benefits outlined above, the regression approach proves to be advantageous for assessing causal relationships. It enables us to obtain standard errors, account for additional control variables, and interpret treatment effects in a meaningful way, contributing to a more comprehensive and robust analysis of the treatment's impact. ### An illustrative example (Continued) -In the [previous building block](/canonical-DiD), we introduced an example to illustrate how to obtain the difference-in-means table for a 2 x 2 DiD design. This example looks into the effect of the Q&A on subsequent ratings using a cross-platform identification strategy with Goodreads as the treatment and Amazon as the control group. -Since we have 2 groups (Amazon vs Goodreads) and 2 time periods (pre Q&A and post Q&A), we use the canonical 2 x 2 DiD design. This can be estimated with the following regression equation. You can find all the analysis code in this [Gist](https://gist.github.com/srosh2000/f52600b76999e88f0fe316e8f23b419e). +In the [previous topic](/canonical-DiD), we introduced an example to illustrate how to obtain the difference-in-means table for a 2 x 2 DiD design. This example looks into the effect of the Q&A on subsequent ratings using a cross-platform identification strategy with Goodreads as the treatment and Amazon as the control group. We have 2 groups (Amazon vs Goodreads) and 2 time periods (pre Q&A and post Q&A). + +The effect can be estimated with the following regression equation. You can find all the analysis code in this [Gist](https://gist.github.com/srosh2000/f52600b76999e88f0fe316e8f23b419e). {{}} {{}} -$$ -rating_{ijt} = \alpha+ \lambda POST_{ijt}+\gamma Goodreads+\delta (POST_{ijt}* Goodreads_{ij})+\eta_j +\tau_t+\epsilon_{ijt} -$$ -where, -$POST$: is a dummy that equals 1 if the observation is after Q&A launch -$Goodreads$: is a dummy equal to 1 if the observation is from the Goodreads platform and 0 if from Amazon - -$\eta$: book fixed effects +
+{{}} -$\tau$: time fixed effects +rating_{ijt} = \alpha+ \lambda POST_{ijt}+\gamma Goodreads + \\ +\delta (POST_{ijt}* Goodreads_{ij})+\eta_j +\tau_t+\epsilon_{ijt} +{{}} +
+
+where, +- $POST$ is a dummy equal to 1 if the observation is after Q&A launch +- $Goodreads$ is a dummy equal to 1 if the observation is from the Goodreads platform and 0 if from Amazon +- $\eta$: Book fixed effects +- $\tau$: Time fixed effects -Before estimating the regression, it is crucial to check whether the **parallel trends assumption** holds which suggests that , in the absence of the treatment, both the treatment and control groups would have experienced the same outcome evolution. We also implicitly assume that the **treatment effect is constant** between the groups over time. Only then can we interpret the DiD estimator (treatment effect) as unbiased. +Before estimating the regression, it is crucial to check whether the **parallel trends assumption** holds which suggests that, in the absence of the treatment, both the treatment and control groups would have experienced the same outcome evolution. We also implicitly assume that the **treatment effect is constant** between the groups over time. Only then can we interpret the DiD estimator (treatment effect) as unbiased. To check for parallel trends, we can visualise the average outcome for both groups over time before and after the treatment. @@ -70,9 +76,7 @@ tidy(model_1, conf.int = TRUE) ``` {{% /codeblock %}} - -However, there might be time-varying and group-specific factors that may affect the outcome variable which requires us to estimate a two-way fixed effects (TWFE) regression. Check out [this building block](/withinestimator) to learn more about the TWFE model. - +However, there might be time-varying and group-specific factors that may affect the outcome variable which requires us to estimate a two-way fixed effects (TWFE) regression. Check out [this topic](/withinestimator) to learn more about the TWFE model, and [this topic](/fixest) to learn more about the `fixest` package. {{% codeblock %}} ```R @@ -113,7 +117,6 @@ tidy(model_4, conf.int = TRUE) ``` {{% /codeblock %}} - Clustering standard errors recognizes that observations within a cluster, such as products or books, may be more similar to each other than to observations in other clusters. This correlation arises due to *unobserved* factors specific to each cluster that can affect the outcome variable. Failure to account for this correlation by clustering standard errors may result in incorrect standard errors, leading to invalid hypothesis tests and confidence intervals. Now let’s wrap up this example by comparing all the regression results obtained so far. @@ -219,15 +222,13 @@ Standard errors are in parentheses. '*' Significant at the 10 percent level. {{% tip %}} -Use the `modelsummary` package to summarise and export the regression results neatly and hassle-free. Check out this [building block](https://tilburgsciencehub.com/topics/analyze-data/regressions/model-summary/) to help you get started! +Use the `modelsummary` package to summarise and export the regression results neatly and hassle-free. Check out this [topic](https://tilburgsciencehub.com/topics/analyze-data/regressions/model-summary/) to help you get started! {{% /tip %}} In model 4, the estimated treatment effect is substantially larger compared to the previous models, emphasizing the significance of selecting an appropriate model specification. By incorporating fixed effects and clustering standard errors, we effectively control for potential unobserved heterogeneity, ensuring more reliable and valid inference. 
The inclusion of fixed effects allows us to account for time-invariant factors that may confound the treatment effect, while clustering standard errors addresses the within-cluster dependence commonly encountered in Difference-in-Differences (DiD) designs. This improved model specification enhances the robustness of the estimated treatment effect and strengthens the validity of our conclusions, emphasizing the importance of these methodological considerations in conducting rigorous empirical analyses. {{% summary %}} - - - The regression approach in the difference-in-difference (DiD) analysis offers several **advantages**: obtain standard errors, include control variables and perform log transformation on the dependent variable. - Time and group fixed effects can be incorporated in the regression analysis to account for time-varying and group-specific factors that may affect the outcome variable. We carry out this **two-way fixed effects (TWFE)** estimation using the `feols()` function from the `fixest` package. - Clustering standard errors is important in DiD designs to address potential correlation or dependence within clusters of data. This can be done using the `cluster` argument. diff --git a/content/topics/Analyze/causal-inference/did/canonical-did-table.md b/content/topics/Analyze/causal-inference/did/canonical-did-table.md index 5ee15babf..4ebe58cff 100644 --- a/content/topics/Analyze/causal-inference/did/canonical-did-table.md +++ b/content/topics/Analyze/causal-inference/did/canonical-did-table.md @@ -1,92 +1,49 @@ --- title: "An Introduction to Difference-in-Difference Analysis" -description: "This building block motivates causal inference, provides theoretical background for the difference-in-difference method and an example that walks you through how to set up the data to compute difference in means table for the 2 x 2 case. " +description: "This topic provides background and motivation for the difference-in-difference method and an example that walks you through how to set up the data to compute difference-in-means table for the 2 x 2 case." keywords: "causal inference, difference-in-difference, DID, R, regression, model, canonical DiD, difference in means table, potential outcomes framework, average treatment effect, ATE, ATT, ATU, treatment effects" draft: false -weight: 9 -author: "Roshini Sudhaharan" -authorlink: "https://nl.linkedin.com/in/roshinisudhaharan" +weight: 1 +author: "Roshini Sudhaharan, Valerie Vossen" aliases: - /canonical-DiD --- -# Overview -Having the ability to establish causality provides a strong foundation for using the term "because" when making claims about the relationship between certain variables. We are then able to say for instance, that the popularity of a song increased *because* it was featured in a movie or that the customer bought a given product *because* they saw an ad and so on. These relationships are not inherently causal, as the direction of cause and effect may be ambiguous, other variables can influence the relationship, or the observed outcome may be due to chance. While regression analysis provides some insight into the significance of associations, interpreting the results as causal relationships requires additional assumptions and thoughtful study designs. -

- -

Source: xkcd (https://xkcd.com/552/)
-

- -While randomized controlled experiments are widely regarded as the gold standard for establishing causality, they may not always be practical, feasible, or ethically permissible. In such circumstances, quasi-experimental methods offer a valuable alternative by leveraging naturally occurring variations in treatment or exposure to approximate controlled experimental conditions. One such prominent quasi-experimental method is known as **Difference-in-Differences** (DiD). This robust statistical approach is extensively used to evaluate the causal impact of a treatment or intervention. It allows estimation of the treatment effect by comparing changes in outcomes between a treatment group and a control group before and after the implementation of the treatment. +## Overview -## DiD: Theoretical Background +While randomized controlled experiments are considered the gold standard for establishing causality, they aren't always practical, feasible, or ethically permissible. In such cases, quasi-experimental methods offer a valuable alternative, using natural variations in treatment or exposure to approximate controlled experimental conditions. -### Potential Outcomes Framework +{{% tip %}} +You can find an introduction to causal inference [here](/causal-inference-intro). +{{% /tip %}} -{{}} -{{}} -Before delving into the Difference-in-Differences design, it is essential to grasp the significance of the **potential outcomes** framework, which serves as the lingua franca of causal inference. In this framework, each unit has two potential outcomes: $Y_i^1$ if the unit receives treatment and $Y_i^0$ if not. However, we can only observe one of these potential outcomes. The observable outcome, denoted by $Y_i$ is determined by a *switching equation*: +*Difference-in-Differences (DiD)* is one such quasi-experimental method commonly used to evaluate the causal impact of a treatment or intervention. The DiD framework is used in a wide range of settings, from evaluating the impact of a rise in the [minimum wage](https://econpapers.repec.org/article/aeaaecrev/v_3a84_3ay_3a1994_3ai_3a4_3ap_3a772-93.htm) to [assessing how the adoption of online streaming affects music consumption and discovery](https://pubsonline.informs.org/doi/pdf/10.1287/mksc.2017.1051). -$$ -Y_i = D_iY_i^1+(1-D_i)Y_i^0 -$$ - -where $D_i = 1$ if the unit is treated and $0$ if not. When $D_i = 1$, $Y_i = Y_i^1$, likewise when $D_i = 0$, $Y_i = Y_i^0$. - -### Average Treatment Effect +{{% summary %}} -While we have previously defined *individual* treatment effect, researchers are often interested in the *average treatment effect:* +DiD estimates the treatment effect by comparing changes in outcomes between a treatment and control group before and after the treatment is implemented. In other words, it examines the *difference in the difference* between these groups. -
-{{}} -ATE = E[\delta_i] \\ -= E[Y_i^1- Y_i^0]\\ -= E[Y_i^1]-E[Y_i^0]\\ -{{}} -
+{{% /summary %}} -Here, we compare potential outcomes when *all* units receive treatment with the potential outcomes when no units receive treatment. +## Setup -Again, since ATE requires one to know BOTH potential outcomes but we observe only one it is unobserved but can be *estimated*. Now, let’s consider the event that individuals might self-select into the treatment which allows us to estimate the *Average Treatment effect on the Treated* units (ATT): +Suppose we are studying the effectiveness of a new educational program (the *treatment*) on students' test scores. The treatment group consists of students who received the program ($D_i = 1$), and the control group of students who did not receive the program ($D_i = 0$). The setup table for DiD is: -
{{}} -ATT = E[\delta_i|D_i=1] \\ -= E[Y_i^1|D_i=1] - E[Y_i^0|D_i=1] {{}} -
- -Similarly, the average treatment effect for the untreated/control group is called *Average Treatment for the Untreated* (ATU): - - -$$ -ATU = E[\delta_i|D_i=0] -$$ -In the DiD setup, we are mainly interested in comparing the outcomes before and after the treatment for the treated and control groups: - | | Before (\$Y_i^0\$) | After (\$Y_i^1\$) | | --------------- | ---------------------- | ---------------------- | | Control (\$D_i = 0\$) | \$E(Y_i^0\|D_i = 0)\$ | \$E(Y_i^1\|D_i = 0)\$ | | Treatment (\$D_i=1\$) | \$E(Y_i^0\|D_i = 1)\$ | \$E(Y_i^1\|D_i = 1)\$ | -Part of the outcomes presented above are *counterfactual*. These outcomes represent what would have happened to the treated and control groups if their treatment statuses were reversed. Naturally, a single person cannot be both treated and not treated, that’s why we only observe one of the two values. - -For more intuition, suppose we are studying the effectiveness of a new educational program (treatment) on students’ test scores. We have a control group of students who did not receive the program (when $D_i = 0$) and a treatment group of students who did receive the program (when $D_i = 1$). - -The counterfactual outcomes help us understand what would have happened if the treatment status were reversed for each group: -- For the *control* group (when $D_i = 0$): The counterfactual outcome represents the expected test scores for the control group if they had received the treatment. This is denoted as $E(Y_i^1|D_i = 0)$. It provides an estimate of what their test scores would have been if they had been part of the treatment group. -- For the *treatment* group (when $D_i = 1$): The counterfactual outcome represents the expected test scores for the treatment group if they had not received the treatment. This is denoted as $E(Y_i^0|D_i = 1)$. It gives us an estimate of what their test scores would have been if they had been part of the control group instead. +Some outcomes are *counterfactual*, representing what would have happened if treatment statuses were reversed. Naturally, an individual cannot simultaneously be treated and not treated. Therefore, we only observe one of the two potential outcomes for each person after treatment implementation: the treated outcome for those who received treatment and the untreated outcome for those who did not. -Now imagine for a while we can move between alternative realities and can observe the same person’s outcome in both scenarios: when they are treated and when they are not. +One might think of simply taking the difference in the average outcome before and after for the treatment group ($Y_1^0-Y_1^1$). However, this approach fails because the treatment occurrence and time are perfectly correlated, making it impossible to isolate the treatment effect from the temporal effects. Similarly, comparing the average outcomes of the treated and control groups in the after period ($Y_1^1-Y_0^1$) is problematic because natural differences between the groups may confound results. The group receiving the program is for example on average more motivated independent of receiving the treatment. -At a first glance, one might think of simply taking the difference in the average outcome before and after for the treatment group ($Y_1^0-Y_1^1$). However, this approach fails because the occurrence of treatment and time are perfectly correlated, making it impossible to isolate the treatment effect from the temporal effects. 
- -Similarly, taking the difference between the average outcomes of the treated and control groups in the after period ($Y_1^1-Y_0^1$) won’t work either. This is because the treatment group might naturally differ from the control group, leading to differences in outcomes that are not solely due to the treatment effect. - -To address these issues, we combine both ideas and perform **double differencing**, which is why it is called *difference-in-difference.* +To address these issues, double differencing (*difference-in-difference*) is done, which involves taking differences between the outcomes of the control group from those of the treatment group before and after treatment to estimate the treatment effect. | | Before | After | After - Before | | --- | --- | --- | --- | @@ -94,12 +51,21 @@ To address these issues, we combine both ideas and perform **double differencing | **Treatment** | $\alpha + \gamma$ | $\alpha+\gamma+\lambda+\delta$ | | | **Treatment** - **Control** | $\gamma$ | $\gamma+\delta$ | $\delta$ | -By taking the difference between the treatment and control groups’ outcomes before treatment ($\gamma$) and the difference between their outcomes after treatment ($\gamma + \delta$), we can obtain the final treatment effect ($\delta$). -## Assumptions for Causality +By taking the difference between the treatment and control groups’ outcomes before treatment ($\gamma$) and the difference between their outcomes after treatment ($\gamma + \delta$), we can obtain the final treatment effect ($\delta$), which is the additional change in outcome for the treatment group compared to the control group after the treatment is implemented. + +{{% tip %}} +$\gamma$ represents any differences in outcomes between the treatment and control groups, that exist before the treatment is implemented. $\lambda$ represents the common change in outcomes over time that affects both the treatment and control groups equally. +{{% /tip %}} + + +## Identifying assumptions To establish causality using the DiD design, two key assumptions are necessary: -1. **Parallel Trends Assumption**: The parallel trends assumption states that, in the absence of treatment, the treated and control groups would have followed similar trends over time. This assumption implies that any differences in outcomes between the two groups before treatment can be attributed to pre-existing differences, as the trends are assumed to be parallel. It ensures the existence of a valid counterfactual outcome to compare the treated group. +1. **Parallel Trends Assumption** + +In the absence of treatment, the treatment and control groups would have followed similar trends over time. It ensures that any differences in outcomes observed before the treatment between the two groups are due to pre-existing differences, as the trends are assumed to be parallel. + More formally, it assumes that the difference between the control group's counterfactual outcome ($E(Y_i^1|D_i = 0)$) and its observed outcome before treatment ($E(Y_i^0|D_i = 0)$) would remain constant over time. Similarly, it assumes that the difference between the treatment group's observed outcome after treatment ($E(Y_i^1|D_i = 1)$) and its counterfactual outcome ($E(Y_i^0|D_i = 1)$) would also remain constant over time.
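In practice, a common first check of parallel trends is to plot the average outcome for both groups over time. A minimal sketch of such a plot, assuming a hypothetical data frame `df` with columns `outcome`, `group`, and `period` (these names are illustrative, not taken from the example data):

{{% codeblock %}}
```R
library(dplyr)
library(ggplot2)

# Average outcome per group and period (hypothetical column names)
df %>%
  group_by(group, period) %>%
  summarise(mean_outcome = mean(outcome), .groups = "drop") %>%
  ggplot(aes(x = period, y = mean_outcome, colour = group)) +
  geom_line() +
  geom_point() +
  labs(x = "Period", y = "Average outcome")
```
{{% /codeblock %}}

If the two lines move roughly in parallel in the pre-treatment periods, the assumption is more plausible, as in the visualisation below.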

@@ -107,16 +73,22 @@ More formally, it assumes that the difference between the control group's counte

Parallel trends visualisation, image by the author

-2. **Conditional Independence (Unconfoundedness) Assumption**: The conditional independence assumption states that, conditional on observed variables, the treatment assignment is independent of potential outcomes. This assumption ensures that there are no confounding factors that simultaneously affect both the treatment assignment and the potential outcomes. It implies that, after controlling for observable covariates, the treatment assignment is random with respect to potential outcomes. Controlling for observable covariates helps reduce the potential for bias due to omitted variables confounding. +2. **Conditional Independence (Unconfoundedness) Assumption** + +Conditional on observed variables, the treatment assignment is independent of potential outcomes. -Now that we have covered all the basics, let's jump to an example to put all this theory to practice! +This ensures that there are no confounding factors that simultaneously affect both the treatment assignment and the potential outcomes. After controlling for observable covariates, the treatment assignment is then random with respect to potential outcomes. + +Now that we have covered all the basics, let's jump to an example to put all this theory into practice! + + +## Example: The 2x2 DiD Table -## An Illustrative Example: The Canonical 2x2 DiD Table On May 21, 2014, Goodreads, a social cataloging website for books, introduced the Q&A feature, enabling users to post questions to authors and fellow readers. It is intriguing to explore the potential impact of this feature on subsequent book ratings. For example, users now have the opportunity to proactively inquire about a book before making a purchase. This interaction may help alleviate uncertainties regarding the book's suitability, potentially reducing customer dissatisfaction and mitigating the likelihood of negative reviews. The dataset used in this example includes information on book ratings and their corresponding dates from both Goodreads and Amazon. In this analysis, we will treat the implementation date of the Q&A policy as the treatment and assess its impact on subsequent book ratings through the employment of the DiD approach. As Amazon lacks the Q&A feature specifically for the book category but still permits consumers to rate and review books akin to Goodreads, it serves as a suitable control group for comparison. You can find the complete data preparation and analysis code in this [Gist](https://gist.github.com/srosh2000/f52600b76999e88f0fe316e8f23b419e) to follow along. -Since we have 2 groups (Amazon Vs Goodreads) and 2 time periods (pre Q&A and post Q&A), we use the canonical 2 x 2 DiD design. +Since we have 2 groups (Amazon Vs Goodreads) and 2 time periods (pre-Q&A and post-Q&A), we use the canonical 2 x 2 DiD design. {{% codeblock %}} ```R @@ -142,7 +114,7 @@ print(means_modified) ``` {{% /codeblock %}} -Here, we compute the average rating for the treatment and control group both before and after the Q&A launch based on the `GoodAma` dataframe. The treatment dummy, `goodr = 1` indicates the *treatment* group (Goodreads), and otherwise the *control* group (Amazon). The time dummy, `qa = 1` indicates the *after* period and `0` for *before* period. +Here, we compute the average rating for the treatment and control group both before and after the Q&A launch based on the `GoodAma` dataframe. The treatment dummy, `goodr = 1` indicates the *treatment* group (Goodreads), and otherwise the *control* group (Amazon). 
The time dummy, `qa = 1` indicates the *after* period and `0` for the *before* period.
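To make the double differencing explicit, the four group averages can also be spread into a 2 x 2 layout and differenced by hand. The sketch below assumes hypothetical column names `goodr`, `qa`, and `avg_rating` in `means_modified`; the article's own computation of `did` follows below.

{{% codeblock %}}
```R
library(tidyr)

# Spread the group means into a 2 x 2 table (hypothetical column names)
means_wide <- pivot_wider(means_modified,
                          names_from = qa,
                          values_from = avg_rating,
                          names_prefix = "qa_")

# (after - before) for Goodreads minus (after - before) for Amazon
did_by_hand <- (means_wide$qa_1[means_wide$goodr == 1] - means_wide$qa_0[means_wide$goodr == 1]) -
  (means_wide$qa_1[means_wide$goodr == 0] - means_wide$qa_0[means_wide$goodr == 0])
did_by_hand
```
{{% /codeblock %}}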

@@ -174,23 +146,16 @@ print(paste("Diff in Diff Estimate: " , did))

Treatment effect value

-The obtained results indicate that the presence of the Q&A feature is associated with lower ratings. However, it is crucial to acknowledge that the differences in means table presented above is merely suggestive and should be interpreted with caution. To obtain more accurate and reliable estimates, it is essential to conduct a regression analysis that allows us to control for other variables that may be related to the outcome and treatment, thereby providing a more comprehensive understanding of the relationship. - -Utilising regression analysis not only enhances the precision of the estimates but also allows us to examine the statistical significance of the results. This helps determine the strength and reliability of the observed relationship between the Q&A feature and ratings. Additionally, regression analysis enables the exploration of potential interactions or nonlinear effects, providing further insights into the complex dynamics at play. +The obtained results indicate that the presence of the Q&A feature is associated with lower ratings. +Note that the differences-in-means table is merely suggestive and should be interpreted with caution. +To obtain more accurate and reliable estimates, it is essential to conduct a regression analysis, which allows to control for variables and examine the statistical significance of the results. Refer to the [next topic](/canonical-DiD/run) to continue with this example as a DiD regression! {{% summary %}} -- **Difference-in-Difference** (DiD) is a powerful statistical method for evaluating the causal impact of a treatment or intervention. - -- DiD compares changes in outcomes between a treatment group and a control group before and after the treatment is implemented. - -- **Average Treatment Effect** (ATE) represents the average effect of treatment on all units. - -- **Average Treatment Effect on the Treated** (ATT) focuses on the treated units, while **Average Treatment Effect for the Untreated** (ATU) focuses on the control group. - -- The **parallel trends** assumption and **conditional independence** assumption are crucial for a causal interpretation of the estimates. +**Difference-in-Difference** (DiD) is a powerful statistical method for evaluating the causal impact of a treatment or intervention. It compares changes in outcomes between a treatment group and a control group before and after the treatment is implemented. Assumptions crucial for a causal interpretation of the estimates are the *parallel trends* assumption and the *conditional independence* assumption. +An example is given to illustrate how to obtain the difference-in-means table for a 2x2 DiD design. To learn how to estimate the effect with regression analysis, refer to [the next topic](/canonical-DiD/run). 
{{% /summary %}} diff --git a/content/topics/Analyze/causal-inference/did/impact-evaluation.md b/content/topics/Analyze/causal-inference/did/impact-evaluation.md index d705efb31..cc0f55d0a 100644 --- a/content/topics/Analyze/causal-inference/did/impact-evaluation.md +++ b/content/topics/Analyze/causal-inference/did/impact-evaluation.md @@ -1,8 +1,8 @@ --- -title: "Impact evaluation with Difference-in-Differences and Regression Discontinuity" +title: "Impact evaluation: RD vs DID" description: "Use Difference-in-Differences and Regression Discontinuity Design to evaluate impacts of quasi-experiments" keywords: "regression, model, DiD, RD, impact evaluation, inference, quasi-experiment" -weight: 1 +weight: 3 date: 2023-10-01T01:00:00+01:00 draft: false aliases: @@ -11,9 +11,8 @@ aliases: - /run/RD - /regression/discontinuity --- -# Impact Evaluation -## Why? +# Overview Many programs are designed to improve outcomes such as learning, health or productivity. Have resources been spent wisely on the program? Did the program/policy work? These are the questions that impact evaluation answers, based on evidence. @@ -21,10 +20,6 @@ In this building block we discuss two of the most commonly used impact evaluatio ## Difference-in-Differences -{{% summary %}} -DiD **compares the changes in outcomes** (e.g. productivity) **over time** between a population that is enrolled in a program (the **treatment group**, e.g. employees who take an IT training) and a population that is not (the **comparison group**, e.g. employees who do not take the IT training). -{{% /summary %}} - ### When should I apply DiD? The DiD framework is used in a wide range of settings, from evaluating the impact of a rise in the [minimum wage](https://econpapers.repec.org/article/aeaaecrev/v_3a84_3ay_3a1994_3ai_3a4_3ap_3a772-93.htm) to assessing how the [adoption of online streaming affects music consumption and discovery](https://tiu.nu/spotify). @@ -81,15 +76,11 @@ The following example comes from [Gertler, Martinez, Premand, Rawlings, and Verm ``` {{% /codeblock %}} - -## Regression Discontinuity - {{% summary %}} -RD is used to estimate the effect of a program or treatment in which candidates are selected for treatment based on whether their value for a numeric index is above or below a certain cutoff point. - - - This method is broadly used in social programs. For instance, antipoverty programs where individuals under a certain poverty index receive help from the government or for scholarships targeted at students that obtain at least a certain grade. +DiD **compares the changes in outcomes** (e.g. productivity) **over time** between a population that is enrolled in a program (the **treatment group**, e.g. employees who take an IT training) and a population that is not (the **comparison group**, e.g. employees who do not take the IT training). {{% /summary %}} +## Regression Discontinuity ### When should I apply RD? @@ -194,7 +185,11 @@ We use the same example as before but we now assess whether the program has mana ``` {{% /codeblock %}} +{{% summary %}} +RD is used to estimate the effect of a program or treatment in which candidates are selected for treatment based on whether their value for a numeric index is above or below a certain cutoff point. + - This method is broadly used in social programs. For instance, antipoverty programs where individuals under a certain poverty index receive help from the government or for scholarships targeted at students that obtain at least a certain grade. 
+{{% /summary %}} ## Additional resources @@ -202,4 +197,5 @@ We use the same example as before but we now assess whether the program has mana - Want to learn more on how to implement these methods and others in R? Check out this [website](https://bookdown.org/aschmi11/causal_inf/regression-discontinuity.html) or [this one](https://bookdown.org/ccolonescu/RPoE4/). +# Reference *Gertler, Paul J., Sebastian Martinez, Patrick Premand, Laura B. Rawlings, and Christel M. J. Vermeersch. 2016. Impact Evaluation in Practice, second edition. Washington, DC: Inter-American Development Bank and World Bank. doi:10.1596/978-1-4648-0779-4.* diff --git a/content/topics/Analyze/causal-inference/did/synth-control.md b/content/topics/Analyze/causal-inference/did/synth-control.md index c0d114a2b..2ded6d630 100644 --- a/content/topics/Analyze/causal-inference/did/synth-control.md +++ b/content/topics/Analyze/causal-inference/did/synth-control.md @@ -2,7 +2,7 @@ title: "Synthetic Control for Impact Evaluation" description: "Use Synthetic control to evaluate impacts of quasi-experiments" keywords: "model, Synthetic Control, RD, impact evaluation, inference, quasi-experiment, abadie" -weight: 9 +weight: 4 date: 2023-10-01T01:00:00+01:00 draft: false aliases: diff --git a/content/topics/Analyze/causal-inference/introduction/_index.md b/content/topics/Analyze/causal-inference/introduction/_index.md new file mode 100644 index 000000000..ac4614561 --- /dev/null +++ b/content/topics/Analyze/causal-inference/introduction/_index.md @@ -0,0 +1,6 @@ +--- +draft: false +title: "Introduction" +weight: 1 + +--- \ No newline at end of file diff --git a/content/topics/Analyze/causal-inference/introduction/causal-inference-intro.md b/content/topics/Analyze/causal-inference/introduction/causal-inference-intro.md new file mode 100644 index 000000000..fd6981637 --- /dev/null +++ b/content/topics/Analyze/causal-inference/introduction/causal-inference-intro.md @@ -0,0 +1,131 @@ +--- +title: "Introduction to Causal Inference" +description: "This topic provides an introduction to causal inference, serving as a background for all the practical methods you can find in this section." +keywords: "causal, inference, econometrics, regression, model, potential, outcomes, framework, treatment, effect, control, ATE, ATT" +draft: false +weight: 1 +author: "Roshini Sudhaharan, Valerie Vossen" +aliases: + - /causal-inference + - /causal-inference-introduction + - /causal-inference-intro +--- + +## Overview + +Having the ability to establish causality provides a strong foundation for using the term "because" when making claims about the relationship between certain variables. We are then able to say for instance, that the popularity of a song increased *because* it was featured in a movie or that the customer bought a given product *because* they saw an ad and so on. These relationships are not inherently causal, as the direction of cause and effect may be ambiguous, other variables can influence the relationship, or the observed outcome may be due to chance. While regression analysis provides some insight into the significance of associations, interpreting the results as causal relationships requires additional assumptions and thoughtful study designs. + + +

+ +

Source: xkcd (https://xkcd.com/552/)
+

+ +## Potential outcomes framework + +{{}} +{{}} + +The **potential outcomes** framework is central to understanding causal inference and the role of randomization. In this framework, each unit has two potential outcomes: +- $Y_i^1$ if the unit receives treatment and +- $Y_i^0$ if it did not + +The observable outcome $Y_i$ is determined by a switching equation: + +$$ +Y_i = D_i Y_i^1 +(1-D_i) Y_i^0 +$$ + +Here, $D_i$ is the treatment indicator, taking the value 1 if the unit is treated and $0$ if not. This equation indicates that: +- When $D_i = 1$, the observable outcome $Y_i$ is $Y_i^1$ and +- when $D_i = 0$, the observable outcome $Y_i$ is $Y_i^0$. + +The core of causal inference lies in comparing these potential outcomes. To know the individual treatment effect, you would want to take the difference $Y_i^1$ - $Y_i^0$. However, this is not possible since only one of these outcomes is observed. This challenge arises from the unobserved counterfactual: we cannot see what would have happened under the alternative scenario. + +## Randomization + +Randomization is key to isolating causal effects. +By randomly assigning some units to treatment and some units to control, the mean of a random sample from the population of units is then an unbiased estimator for the mean of the population. The *counterfactual* is based on random selection into treatment. However, random interventions are often infeasible or unethical. Causal inference bridges the gap between the observational data and the hypothetical scenario in which randomization took place. + +## Selection bias + +If a treatment group is NOT randomly selected and treated individuals self-select into treatment, selection bias arises. This means the treatment group would have a different outcome than the control group even without treatment. Consequently, the estimated *average* treatment effect may be over- or underestimated. + +As a simple example, say you are interested in the effect of extra advanced classes on final grades. Just comparing the average grades of the group that signed up for these classes (the *treatment group*) with that of the group that did not (the *control group*) may overestimate the effect. This bias comes from different characteristics between these groups that are confounding the treatment effect. For example, the treatment group may on average be more motivated already, which overestimates the effect the classes have on grades. + +The estimated treatment effect thus consists of two parts: *the unit causal effect* + *the selection bias* (the second line) = + +
+{{}} + +E(Y_i^1 |D_i = 1) - E(Y_i^0 | D_i = 1) + \\ (E(Y_i^0 | D_i = 1) - E(Y_i^0 | D_i = 0)) + +{{}} +
+
+ +1. *Unit causal effect*: Difference in outcome for treated units and untreated units +2. *Selection bias*: Difference in outcome between the treatment and control group when neither is treated. + +### Average treatment effects + +Following this reasoning, three parameters are distinguished. Depending on your research question, all three may be of interest, though the first two are typically the most important. + +1. The *Average Treatment Effect (ATE)* is the difference between the potential outcomes if all units receive treatment and the potential outcomes when no units receive treatment. In mathematical terms: + +
+{{}} +ATE = E[\delta_i] = E[Y_i^1- Y_i^0] =\\ +E[Y_i^1]-E[Y_i^0]\\ +{{}} +
+
+ +Again, since ATE requires one to know BOTH potential outcomes but we observe only one it is unobserved but can be estimated. + +2. The *Average Treatment Effect on the Treated Units (ATT)* is the treatment effect for *only* the treatment group. It is likely to be different from the ATE if individuals self-select into treatment, and therefore treatment units differ from control units in other characteristics. In our example, it is the effect of attending advanced classes on final grades for the group that attended these classes. + +
+{{}} +ATT = E[\delta_i|D_i=1] = \\ +E[Y_i^1|D_i=1] - E[Y_i^0|D_i=1] +{{}} +
+ +3. The *Average Treatment Effect for the Untreated (ATU)* is the treatment effect for those individuals in the control group. The ATU is likely to be different from the ATT, if self-selection made the treatment and control group differ on certain characteristics that affect the outcome variable. In our example, the ATU is the effect of advanced classes on the final grades of people that did not attend. + +
+{{}} +ATU = E[\delta_i|D_i=0] = \\ +E[Y_i^1|D_i=0] - E[Y_i^0|D_i=0] +{{}} +
+ +{{% summary %}} + +Causal inference methods are essential for understanding the impact of variables on outcomes. The potential outcomes framework lays the conceptual framework for causal inference. Understanding the role of randomization and selection bias is essential for reliable causal inference. + +- The *Average Treatment Effect* (ATE) represents the average effect of treatment on all units. + +- The *Average Treatment Effect on the Treated* (ATT) focuses on the treated units, while the *Average Treatment Effect for the Untreated* (ATU) focuses on the control group. + +Navigate to the ["Causal Inference" section](https://tilburgsciencehub.com/topics/analyze/causal-inference/#causal-inference-ezo) for applications on the following causal inference methods: + +- [Regression Discontinuity Designs (RDD)](https://tilburgsciencehub.com/topics/analyze/causal-inference/rdd) +- [Difference in Difference](https://tilburgsciencehub.com/topics/analyze/causal-inference/did) +- [Panel Data](https://tilburgsciencehub.com/topics/analyze/causal-inference/panel-data) +- [Instrumental Variables](https://tilburgsciencehub.com/topics/analyze/causal-inference/instrumental-variables) +- [Matching](https://tilburgsciencehub.com/topics/analyze/causal-inference/matching) + +{{% /summary %}} + + +# Reference +- [Cunningham, S. (2021). Causal inference: The mixtape. Yale university press.](https://mixtape.scunning.com/) + + + + + + + diff --git a/content/topics/Analyze/causal-inference/did/images/corr_cause.png b/content/topics/Analyze/causal-inference/introduction/images/corr_cause.png similarity index 100% rename from content/topics/Analyze/causal-inference/did/images/corr_cause.png rename to content/topics/Analyze/causal-inference/introduction/images/corr_cause.png diff --git a/content/topics/Analyze/causal-inference/matching/_index.md b/content/topics/Analyze/causal-inference/matching/_index.md new file mode 100644 index 000000000..7cdef07b7 --- /dev/null +++ b/content/topics/Analyze/causal-inference/matching/_index.md @@ -0,0 +1,6 @@ +--- +draft: false +title: "Matching" +weight: 5 +type: subcategory +--- \ No newline at end of file diff --git a/content/topics/Analyze/causal-inference/matching/approximate-matching.md b/content/topics/Analyze/causal-inference/matching/approximate-matching.md new file mode 100644 index 000000000..e531a6a1b --- /dev/null +++ b/content/topics/Analyze/causal-inference/matching/approximate-matching.md @@ -0,0 +1,250 @@ +--- +title: "Propensity Score Matching: A Method for Approximate Matching" +description: "Explore Approximate and especially Propensity Score Matching, where units are paired based on similar (but not identical) characteristics to create comparable groups in observational studies." +keywords: "matching, propensity, score, causal, inference, effect,regression, R, exact, approximate" +draft: false +weight: 2 +author: "Valerie Vossen" +aliases: + - /approximate-matching + - /propensity-score-matching +--- + +## Overview + +[Here](/matching) we introduced matching where we specifically discussed exact matching, the method in which individuals are paired on one or more *identical* characteristics. + +Now we will continue with approximate matching, where pairs are made on *similar but not identical* characteristics. It is again a way to adjust for differences when treatment is non-random. Important to mention is that this method is only relevant for selection on observables, which means you should be able to identify the confounders (and have data on them!) 
that are biasing the effect. We focus on Propensity Score Matching specifically and provide a code example in R. + +## Curse of dimensionality + +[Exact matching](/matching) requires an untreated unit with precisely the same value on each of the observed characteristics to be a good counterfactual for the treated units. The curse of dimensionality means that when there is a large number of variables on which you are matching treated and untreated units, or when some variables are continuous, there might be no appropriate counterfactual left (that has all the same values on all these covariates). This can be an issue in finite samples, where you have too many covariates to match on and not enough data. + +Approximate matching offers a solution; it reduces the dimensionality by defining a distance metric on the characteristics X and pairing units based on this distance rather than on the value of X. As the sample gets larger, the results of approximate matching methods will all tend towards those of exact matching. + + +## Approximate matching methods + +{{% summary %}} + +Common methods of approximate matching include: + +- *Nearest-neighbour matching*: Pairs each treated unit with the "nearest" untreated unit based on defined characteristics. + +- *Kernel matching*: Estimates the counterfactual outcome of the treated unit by averaging outcomes of nearby control units. + +- *Propensity score matching*: Uses the propensity score, representing the probability of treatment assignment given observed covariates. + +The guide by [Glen Waddell](https://pages.uoregon.edu/waddell/metrics/matching.html) is a good resource that discusses these types in more detail. + +{{% /summary %}} + +As the sample size increases, different matching estimators will yield similar results. However, their performance varies in finite samples. The choice depends on your data. [Glen Waddell](https://pages.uoregon.edu/waddell/metrics/matching.html) also gives a broader discussion on this and some points to consider when choosing the method. + + +## Propensity score matching + +Propensity score matching addresses the curse of dimensionality by matching on the "propensity score", representing the probability of being assigned to the treatment group, conditional on the particular covariates $X$: +
+
+{{}} +P(X_{i}) ≡ Pr(D_i = 1 | X_i) +{{}} + +
+ +By controlling for these propensity scores, you compare units that, based solely on their observable characteristics, had similar probabilities of getting the treatment. This mitigates the selection bias; the remaining variation in the outcome variable can then be assigned as being due to the treatment only. + +{{% tip %}} +When conditional independence holds true given $X$, it also holds true when conditioned on the propensity score. This famous result by [Rosenbaum and Robin (1983)](https://academic.oup.com/biomet/article/70/1/41/240879) is useful because while $X$ may be high-dimensional, the propensity score is one-dimensional. Consequently, matching on the propensity score increases the likelihood of finding common support between treatment and control groups, facilitating better and closer matches. +{{% /tip %}} + + +## Practical example of PSM + +### Setting +We will use the second example of [Imbens (2015)](https://jhr.uwpress.org/content/50/2/373.short) to show matching in practice. Find context on the example in part B section 6.2. The authors examine the effect of participation in a job training program on earnings. Specifically, it is the NSW program that has the goal to help disadvantaged workers. + +Studying this question isn't straightforward, because individuals who enroll in the training program can differ from those who don't. To tackle this challenge, they conduct a field experiment where qualified participants are randomly assigned to training positions. The Lalonde dataset is collected from this field experiment. + +However, Imbens (2015) finds evidence of selective attrition in this dataset: individuals are more likely to drop out of the experiment in a manner where the likelihood of dropping out depends on treatment assignment. Hence, we cannot assume treatment holds in our dataset. Nonetheless, we can still utilize the matching estimator instead of the difference-in-means estimator if the rate of attrition remains the same when we control for the observed covariates. + +### Load packages and data + +You can load the data by copying the following code into R. + +{{% codeblock %}} +```R +data_url <- "https://raw.githubusercontent.com/tilburgsciencehub/website/topic/interpret-summary-regression/content/topics/Analyze/causal-inference/matching/nsw_dw.dta" + +load(url(data_url)) # is the cleaned data set + +``` +{{% /codeblock %}} + +Load the following packages: + +{{% codeblock %}} +```R +library(knitr) +library(haven) +library(MatchIt) +``` +{{% /codeblock %}} + + +### Difference-in-means estimate + +The outcome variable is the income of participant $i$ in 1978 (`re78`). The treatment variable is a binary variable that denotes 1 if the participant took part in the training program and 0 if not (`treat`). + +First, we calculate the ATT by using the difference-in-means estimate, without solving the potential bias coming from selective attrition. + +The following R code calculates the mean of `re78` for each treatment group, storing it in a new data frame called `meanre78`. Then, the difference-in-means estimate is calculated by subtracting the mean of the control group from the mean of the treatment group. + +{{% codeblock %}} +```R +meanre78 <- aggregate(re78~treat, data=data, FUN=mean) +estimate_diffmean <- meanre78[2,2] - meanre78[1,2] +estimate_diffmean + +t.test(data$re78[data$treat==1], + data$re78[data$treat==0] + ) + +``` +{{% /codeblock %}} + + +

+ +

+ + +The difference-in-means estimate `1794.342` is returned. Additionally, a two-sample t-test compares the `re78` values between the treatment and control group and tests the null hypothesis that the means of the two groups are equal. The p-value lower than 0.05 (here, `0.007`) indicates the means are statistically significant different from each other. + + +### Balance test + +To check whether randomization went right and whether the treatment and control groups are balanced, we compare the observed covariates with a summary statistics table. The covariates of interest are defined by the `columns_of_interest` list. Then, the mean and standard deviation of each variable is calculated for both the treatment and control group, and t-tests are conducted to compare these means. The results are extracted and stored in vectors. + +{{% codeblock %}} +```R +# Generate binary variable for unemployment +data$u74 = data$re74<0.0001 +data$u75 = data$re75<0.0001 + +# Defining the covariates (columns) of interest +columns_of_interest = c("black","hispanic","age","married","nodegree","education","re74","re75") + +# Calculate means and standard deviations +treatment_means = colMeans(data[data$treat==1,columns_of_interest]) +control_means = colMeans(data[data$treat==0,columns_of_interest]) +treatment_std = colSd(data[data$treat==1,columns_of_interest]) +control_std = colSd(data[data$treat==0,columns_of_interest]) + +# Perform t-tests and extract results +t_test_results = sapply(columns_of_interest, function(x) { + t.test(data[[x]][data$treat==1],data[[x]][data$treat==0])$statistic +}) + +t_test_pvalue = sapply(columns_of_interest, function(x) { + t.test(data[[x]][data$treat==1],data[[x]][data$treat==0])$p.value +}) + +# Create summary table +df <- data.frame( + Covariate = columns_of_interest, + mean_control = round(control_means, digits = 2), + sd_control = round(control_std, digits = 2), + mean_treated = round(treatment_means, digits = 2), + sd_treated = round(treatment_std, digits = 2), + t_stat = round(t_test_results, digits = 1), + p_value = round(t_test_pvalue, digits = 3) +) + +kable(df) + +``` +{{% /codeblock %}} + +

+ +

+ +The first two columns present the results of the control group, and the third and fourth are the results of the treatment group. +Significant differences in some characteristics (p-value < `0.05`) suggest an imbalance between the two groups. For example, since people with no degree are more likely to drop out of the treatment group, the amount of people with `nodegree` is significantly higher in the control group. If people with `nodegree` have lower earnings independent of whether they received a program, the treatment effect is likely to be overestimated. Matching offers a solution, where we can control for this kind of covariates. + +### Propensity score matching + +To avoid the curse of dimensionality, since we have multiple observed covariates and relatively few observations, we use propensity score matching. + +1. Calculate the propensity score + +Calculate the propensity score by estimating a logistic regression with the `glm()` function. The dependent variable is the treatment indicator, the independent variables include the covariates and interaction terms. + +{{% codeblock %}} +```R +# Create interaction terms +data$re74nodegree = data$re74*data$nodegree +data$nodegreeeducation = data$nodegree*data$education +data$u75education = data$education*data$u75 + +# Run the logistic regression +propreg = glm(treat ~ re74 + u74 + re75 + u75 + nodegree + hispanic + education + nodegreeeducation + re74nodegree + u75education, family = binomial(link = "logit"),data=data) +summary(propreg) + +``` +{{% /codeblock %}} + +

+ +

+ +This logistic regression model estimates the propensity score, which is the probability of receiving the treatment given the observed covariates $\hat{P}(X_i)$. + + +### Nearest neighbor propensity score matching + +We now use the propensity scores to conduct nearest neighbor matching, ensuring balance between treatment and control groups. The `matchit()` function is used, in which the formula, the method (here, `nearest`) and the distance metric are specified. The distance metric is used to measure the similarity between individuals; it is calculated as the log odds ratio of the propensity score. + +{{% codeblock %}} +```R +# Generate the matched data +match_prop <- matchit(treat ~ re74 +u74 + re75 + u75 + nodegree + hispanic + education + nodegreeeducation + re74nodegree + u75education, method = "nearest", distance = "glm", link = "logit", data=data ) + +# Data containing treated observations and their matched equivalence in the control group +data_match_prop <- match.data(match_prop) +``` +{{% /codeblock %}} + +Estimate the ATT by comparing the means of the outcome variable (`re78`) between the treated and control groups in the matched data. + +{{% codeblock %}} +```R +# Estimate the ATT +mean_re78_match_prop <- aggregate(re78~treat, data=data_match_prop, FUN=mean) + +estimate_matching <- mean_re78_match_prop$re78[2] - mean_re78_match_prop$re78[1] + +estimate_matching + +# Another t-test to check if means are statistically different +t.test(data_match_prop$re78[data_match_prop$treat==1],data_match_prop$re78[data_match_prop$treat==0]) + +``` +{{% /codeblock %}} + +

+ +

+ +The difference between income is `1504`, which is lower than the estimated ATT before matching. The p-value of `0.03` confirms the means are statistically different from each other. +This is in line with our reasoning: the treatment effect was underestimated when ignoring that on average the control group contained more people with `nodegree`, as a consequence of selective attrition. + +{{% summary %}} + +Approximate matching, specifically Propensity Score Matching (PSM), addresses selection bias by matching individuals based on their propensity scores, representing the probability of receiving the treatment given observed covariates. This method helps mitigate the curse of dimensionality, where exact matching becomes impractical due to a large number of variables or continuous covariates. + +Using a practical example from Imbens (2015) on the effect of job training programs on earnings, we demonstrated the application of PSM in R. By estimating propensity scores and conducting nearest-neighbor matching, we were able to assess the Average Treatment Effect on the Treated (ATT) and address potential bias from selective attrition, where people with no degree where more likely to drop out of the treatment group. + +{{% /summary %}} diff --git a/content/topics/Analyze/causal-inference/matching/images/effectcoding_summary.png b/content/topics/Analyze/causal-inference/matching/images/effectcoding_summary.png new file mode 100644 index 000000000..529de363e Binary files /dev/null and b/content/topics/Analyze/causal-inference/matching/images/effectcoding_summary.png differ diff --git a/content/topics/Analyze/causal-inference/matching/images/histogram_age.png b/content/topics/Analyze/causal-inference/matching/images/histogram_age.png new file mode 100644 index 000000000..52713c610 Binary files /dev/null and b/content/topics/Analyze/causal-inference/matching/images/histogram_age.png differ diff --git a/content/topics/Analyze/causal-inference/matching/images/psm_balancetest.png b/content/topics/Analyze/causal-inference/matching/images/psm_balancetest.png new file mode 100644 index 000000000..b2f380b83 Binary files /dev/null and b/content/topics/Analyze/causal-inference/matching/images/psm_balancetest.png differ diff --git a/content/topics/Analyze/causal-inference/matching/images/psm_ttest.png b/content/topics/Analyze/causal-inference/matching/images/psm_ttest.png new file mode 100644 index 000000000..06dde1d7f Binary files /dev/null and b/content/topics/Analyze/causal-inference/matching/images/psm_ttest.png differ diff --git a/content/topics/Analyze/causal-inference/matching/images/psm_ttest2.png b/content/topics/Analyze/causal-inference/matching/images/psm_ttest2.png new file mode 100644 index 000000000..b36020956 Binary files /dev/null and b/content/topics/Analyze/causal-inference/matching/images/psm_ttest2.png differ diff --git a/content/topics/Analyze/causal-inference/matching/images/summary_propreg.png b/content/topics/Analyze/causal-inference/matching/images/summary_propreg.png new file mode 100644 index 000000000..ae1920179 Binary files /dev/null and b/content/topics/Analyze/causal-inference/matching/images/summary_propreg.png differ diff --git a/content/topics/Analyze/causal-inference/matching/matching.md b/content/topics/Analyze/causal-inference/matching/matching.md new file mode 100644 index 000000000..9f2b06b35 --- /dev/null +++ b/content/topics/Analyze/causal-inference/matching/matching.md @@ -0,0 +1,307 @@ +--- +title: "An Introduction to (Exact) Matching" +description: "Matching is 
used to create comparable groups in observational studies, helping to mitigate the effects of confounding variables and estimate causal effects." +keywords: "matching, causal, inference, effect,regression, R, exact, approximate" +draft: false +weight: 1 +author: "Valerie Vossen" +aliases: + - /exact-matching + - /matching +--- + +## Overview + +Randomization is a fundamental principle in experimental design, aiming to have a good counterfactual and ensuring the treatment and control groups are similar except for being treated. However, in cases where treatment is not randomly assigned, confounding variables can bias the estimated causal effect. + +Matching offers an alternative approach by basically creating an artificial counterfactual. This is done by pairing individuals or units based on specific observable characteristics to create comparable groups. This article provides a comprehensive introduction to matching, focusing on both the theory behind the method and practical applications. + +## Exact and approximate matching + +Exact matching involves pairing individuals who share **identical characteristics**. This requires the observable characteristic on which pairing happens to be a binary variable. Also ideally, the control group has many observations at each distinct value of the binary variable the observations are matched on. + +In contrast, [approximate matching](/approximate-matching) allows for some degree of flexibility and pairs on similar but not identical characteristics. + + +## Identifying assumptions + +{{% summary %}} + +Two identifying assumptions should hold for exact matching to be valid: + +1. *Conditional Independence Assumption:* Given observed covariates *X*, there are no systematic differences between treated and untreated units. +
+
+ +2. *Overlapping support*: Conditional on *X*, there are treated and untreated units. + +{{% /summary %}} + + +### 1. Conditional Independence Assumption (CIA) + +The Conditional Independence Assumption is the primary assumption underlying matching. It states that **once we account for the observed characteristics of units or individuals, there should be no systematic differences in potential outcomes between the treatment and control groups**. + +Mathematically, this assumption is expressed as follows: + +{{}} +(Y_{0}, Y_{1}) \perp T \:|\: X +{{}} + +
+
Where $Y_{0}$ and $Y_{1}$ are the potential outcomes in the control and treatment group, respectively, which are independent of treatment assignment $T$ for each value of the observed covariates $X$. Here, $X$ denotes the binary variable(s) on which the observations are matched.

Importantly, this assumption also implies that no *unobservable* characteristics differ between the groups in a way that affects treatment likelihood or treatment effects. When dividing the sample by $X$, for each value of $X$, the variation in $D$ is akin to random. In some cases, this condition is known to hold. A notable example is "Project STAR", where treatment was randomly assigned within schools but with varying likelihoods across different schools. Hence, school identity is included in $X$.

{{% tip %}}
While the absence of unobservable differences cannot be directly tested, there are some methods to assess the plausibility of the assumption.

As discussed in [Imbens (2015)](https://jhr.uwpress.org/content/50/2/373.short), one method is to calculate the causal effect of treatment on a pseudo-outcome that you know is unaffected by it, for example, a lagged outcome. If the treatment effect on the pseudo-outcome is close to zero, it strengthens the assumption's plausibility. [Section VG (page 395)](https://jhr.uwpress.org/content/50/2/373.short) discusses this method in more detail.
{{% /tip %}}


### 2. Overlapping support

The assumption of overlapping support states that, conditional on observed covariates $X$, there exist both treated and untreated units across the entire range of $X$. In other words, there should be enough overlap in the distribution between the treatment and control groups to enable meaningful comparisons. This ensures that the treatment and control groups are comparable across all observed covariates, allowing for valid causal inference.


## Exact matching estimator

If the above two assumptions are valid, the following identity follows:

{{}}
E[Y_1 - Y_0 | X] = \\

E[Y_1 | X, D = 1] - E[Y_0 | X, D = 0] = \\

E[Y | X, D=1] - E[Y | X, D=0]
{{}}
+
+ +Where: +- $Y_1$ is the outcome for the treatment group +- $Y_0$ is the outcome for the control group +- $D$ is the treatment indicator (1 if treated, 0 if control) + +The average effect of the treatment on the treated (ATT) is estimated by taking the expected outcome of the treatment group minus the expected outcome of the matched controls, averaged out over the treatment group. + +The estimated average treatment effect on the treated ($\hat{d}_{\text{ATT}}$) is expressed as follows: + +{{}} +\hat{d}_{\text{ATT}} = \frac{1}{N_1} \sum_{D = 1} (Y_i - Y_{j(i)}) +{{}} +
+
Where:
- $N_{1}$ is the total number of units in the treatment group
- $Y_{i}$ is the outcome variable of the treated unit $i$
- $Y_{j(i)}$ is the outcome variable of the control unit matched to unit $i$

{{% tip %}}
The counterfactual is the mean outcome in the control group for observations with the exact same characteristics.
{{% /tip %}}


## Practical example

A simple practical example of exact matching follows, using data simulated for this purpose. Say we are interested in finding the effect of a graduate traineeship program on earnings. We have data on 100 employees, of which 50 completed a traineeship at the start of their career (the *treatment group*) and 50 did not (the *control group*).


First, load the necessary packages and the data set:

{{% codeblock %}}
```R
library(MatchIt)
library(ggplot2)
library(dplyr)

# Load data
data_url <- "https://raw.githubusercontent.com/tilburgsciencehub/website/master/content/topics/Analyze/causal-inference/matching/jobtraining_data.Rda"

# load() reads the objects stored in the .Rda file into the environment
# (the data frame is assumed to be stored under the name `data`)
load(url(data_url))

View(data)
```
{{% /codeblock %}}

When observing the data, we notice the groups are not similar. The people in the treatment group, who followed the traineeship, are on average younger (26 years) than the people in the control group (39 years):

{{% codeblock %}}
```R
# Average age for the treatment group
mean(data$Age[data$Treatment == 1])

# Average age for the control group
mean(data$Age[data$Treatment == 0])
```
{{% /codeblock %}}


The following histogram confirms this unequal distribution:

{{% codeblock %}}
```R
# Create a combined data frame for treatment and control groups
data$Group <- ifelse(data$Treatment == 1, "Treatment", "Control")

# Create the histogram
ggplot(data, aes(x = Age, fill = Group)) +
  geom_histogram(position = "identity",
                 alpha = 0.5,
                 binwidth = 1) +
  labs(title = "Age Distribution: Treatment vs. Control Group",
       x = "Age",
       y = "Frequency") +
  scale_fill_manual(values = c("blue", "red"))

```
{{% /codeblock %}}

+ +

+ +If younger people have lower earnings on average (which is likely to be the case), the effect of the traineeship is underestimated due to this unequal distribution of treatment assignment. First, we calculate the treatment effect while ignoring this potential bias: + +{{% codeblock %}} +```R +ATT_original <- mean(data$Earnings[data$Treatment == 1] - mean(data$Earnings[data$Treatment == 0])) + +print(ATT_original) + +``` +{{% /codeblock %}} + +The ATT is `-998.26`. So, not taking the average age difference into account, you find a negative effect of the traineeship program on earnings. The ATT is biased. + +A solution is to match a treated employee to an untreated employee on age and compare their earnings instead. You can do this with the `matchit` function from the `MatchIt` in R, to match observations, specifying `Treatment` and `Age`. + +{{% codeblock %}} +```R +matched_data <- matchit(Treatment ~ Age, data = data, method = "exact") + +# Extract the matched dataset +matched_data <- match.data(matched_data) + +``` +{{% /codeblock %}} + +Calculate the ATT, but now on the matched sample: + +{{% codeblock %}} +```R +ATT_matching <- mean(matched_data$Earnings[matched_data$Treatment == 1] - matched_data$Earnings[matched_data$Treatment == 0]) + +print(ATT_matching) + +``` +{{% /codeblock %}} + +The ATT is `7968.828`, suggesting a positive effect of the traineeship program on earnings, which makes more sense! + +## OLS as a matching estimator + +Another approach to control for the effects of other variables is to use Ordinary Least Squares (OLS) regression as a matching estimator. Regress the outcome variable ($Y$) on the treatment indicator ($D$), and the covariates ($X$). + +furthermore, to understand how the treatment effect depends on observable characteristics $X$, we can include interaction terms between $D$ and $X$ in the regression model: + +{{}} +Y = \beta_0 + \beta_1 D + \beta_2 * X + \beta_3 (D * X) + \epsilon_i +{{}} + +Where +- $Y$ is the outcome variable +- $D$ is the treatment indicator (1 = treated, 0 = control) +- $X$ is a vector of covariates +- $D * X$ is the interaction effect between the treatment and covariate(s) *X*. + +### Effect coding + +Effect coding is another approach to include these interaction terms, allowing for easier interpretation. It involves coding categorical variables such that the coefficients represent deviations from the overall mean. It can help understand how the treatment effect varies across different levels of X. + +The regression equation with interaction terms included would look like this: + +{{}} +Y = \beta_0 + \beta_1 D + \beta_2 * X + \beta_3 D * (X_i - \bar{X}) + \epsilon_i +{{}} + +
+
+ +Where $D * (X_i - \bar{X})$ is the interaction between the treatment and the de-meaned covariate $X_{i} - \bar{X}$. +
It captures how the treatment effect varies with deviations of the covariate $X$ from its mean value ($\bar{X}$). Specifically, $\beta_3$ indicates the additional effect of the treatment for each unit change in the covariate(s), compared to the average treatment effect.

When $X_i = \bar{X}$, the expected difference in outcomes between treated and untreated units is $\beta_1$. This is the ATE under conditional independence.

The ATT can be estimated as:

{{}}
\hat{\beta}_1 + \frac{1}{N_1} \sum_{i: D_i = 1} (X_i - \bar{X})' \hat{\beta}_3
{{}}


The following R code creates a de-meaned variable for `Age` and runs the OLS regression with an interaction term between `Treatment` and `Age_demeaned`.

{{% codeblock %}}
```R
# Create a de-meaned covariate
data$Age_demeaned = data$Age - mean(data$Age)

# OLS regression with interaction term between treatment and de-meaned age
ols_model <- lm(Earnings ~ Treatment * Age_demeaned, data = data)

summary(ols_model)
```
{{% /codeblock %}}

+ +

+ +The coefficient for treatment is `351.4`, indicating a positive but statistically insignificant effect of the program on earnings. + +The estimate for the interaction term measures the dependence of the treatment effect on covariate `Age`. While insignificant, the negative sign indicates the effect of the program on earnings is reduced for individuals with an age that is further away from the mean. + +{{% tip %}} + +For a full interpretation of the summary output of the regression model, refer to [this topic](/regressionoutput). + +{{% /tip %}} + +## OLS versus Matching method + +While using OLS regression and adding covariates for each observable characteristic, and the Matching method both rely on the Conditional Independence Assumption to facilitate causal inference, opting for matching has its advantages. Reasons to consider matching instead of the OLS method are outlined in the table below: + + +| | Matching Method | OLS Method | +|-------------------------------------|-------------------------|-----------------------------| +| Functional
form | Linear functional form not required | Assumes linear functional form |
| Comparable untreated units | Identifies whether there are comparable untreated units available for each treated unit. | Does not identify whether there is a lack of comparable untreated units for each treated unit. |
| Counterfactual weighting | Expected counterfactual for each treated unit weighted based on observable characteristics of the untreated units. | Uses whole control group for determining the
expected counterfactual. | + + +{{% summary %}} + +The identifying assumptions *Conditional Independence* and *Overlapping Support* are crucial. When these assumptions hold, matching provides a framework for establishing causal relationships in observational studies. An example of exact matching is given, where individuals were paired based on the identical values of their age. + +In OLS regression, incorporating an interaction term between the treatment indicator and the (de-meaned) covariate allows for assessing how the treatment effect varies across different levels of the covariate. + +You can continue the content of matching by reading the [next topic on Approximate Matching](\approximate-matching). + +{{% /summary %}} + + + + + + diff --git a/content/topics/Analyze/causal-inference/matching/nsw_dw.dta b/content/topics/Analyze/causal-inference/matching/nsw_dw.dta new file mode 100644 index 000000000..9c0a3d5de Binary files /dev/null and b/content/topics/Analyze/causal-inference/matching/nsw_dw.dta differ diff --git a/content/topics/Analyze/causal-inference/rdd/_index.md b/content/topics/Analyze/causal-inference/rdd/_index.md index 8d0347849..431826ed0 100644 --- a/content/topics/Analyze/causal-inference/rdd/_index.md +++ b/content/topics/Analyze/causal-inference/rdd/_index.md @@ -1,6 +1,6 @@ --- draft: false title: "Regression Discontinuity Designs (RDD)" -weight: 1 +weight: 2 type: subcategory --- \ No newline at end of file diff --git a/content/topics/Analyze/machine-learning/ml-intro/images/artificial-neuron-representation-math.png b/content/topics/Analyze/machine-learning/ml-intro/images/artificial-neuron-representation-math.png new file mode 100644 index 000000000..13a875a69 Binary files /dev/null and b/content/topics/Analyze/machine-learning/ml-intro/images/artificial-neuron-representation-math.png differ diff --git a/content/topics/Analyze/machine-learning/ml-intro/images/deep-neural-network-graphics.png b/content/topics/Analyze/machine-learning/ml-intro/images/deep-neural-network-graphics.png new file mode 100644 index 000000000..2c94969f2 Binary files /dev/null and b/content/topics/Analyze/machine-learning/ml-intro/images/deep-neural-network-graphics.png differ diff --git a/content/topics/Analyze/machine-learning/ml-intro/images/neuron-anatomy.png b/content/topics/Analyze/machine-learning/ml-intro/images/neuron-anatomy.png new file mode 100644 index 000000000..851370caa Binary files /dev/null and b/content/topics/Analyze/machine-learning/ml-intro/images/neuron-anatomy.png differ diff --git a/content/topics/Analyze/machine-learning/ml-intro/introduction-to-deep-learning.md b/content/topics/Analyze/machine-learning/ml-intro/introduction-to-deep-learning.md new file mode 100644 index 000000000..4679089cc --- /dev/null +++ b/content/topics/Analyze/machine-learning/ml-intro/introduction-to-deep-learning.md @@ -0,0 +1,231 @@ +--- +title: "Introduction to Deep Learning" +description: "Broaden your knowledge about machine learning with this introduction to deep learning." +keywords: "machine learning, deep learning, neural networks, model, python" +weight: 3 +date: 2024-02-18T22:02:51+05:30 +draft: false +aliases: + - /learn/machine-learning +--- +# What is Deep Learning? + +Deep learning is a type of machine learning that's inspired by the structure and function of the human brain, particularly in how it processes information and learns from it. It's called "deep" learning because it involves neural networks with many layers—these are the "deep" part. 
Each layer of these networks learns to transform the data in a way that makes it more useful for achieving the network's overall goal. + +Deep learning is commonly used in image recognition, natural language processing, and speech recognition tasks. It finds applications in industries such as healthcare, finance, and autonomous vehicles, where it enables tasks like medical diagnosis, fraud detection, and autonomous driving. + +## Neural Networks: Inspired by the Brain + +Deep learning draws inspiration from the human brain's complex structure. Geoffrey Hinton's pioneering research asked: Can computer algorithms mimic brain neurons? This exploration aimed to harness the brain's power by mirroring its architecture. The anatomy of a single neuron is presented below. + +

+ +

Picture source: McCullum, N. (2021, 28 April). Deep Learning Neural Networks Explained in Plain English.
+ +Neurons, the brain's building blocks, work in networks, where their combined activity creates meaning. Neurons receive and send signals through dendrites and axons. In computer models, this is replicated through weighted inputs and activation functions. + +### Mimicking Neurons in Computers + +In deep learning, neurons gather inputs, compute weighted sums, and pass them through activation functions. Weights, vital for model training, adjust to optimize performance, forming the core of deep neural network training. + +

+ +

+Picture source: McCullum, N. (2021, 28 April). Deep Learning Neural Networks Explained in Plain English. + +# Shallow and deep neural networks + +Neural networks could be distinguished between shallow and deep neural networks. What is the difference? + +Single-layer neural networks, also referred to as shallow neural networks, comprise a single hidden layer positioned between the input and output layers. This hidden layer conducts computations, orchestrating the transformation of input data to yield the intended output. Shallow neural networks, though straightforward, possess restricted capacity for learning intricate patterns and representations. + +In contrast, deep neural networks boast multiple hidden layers distributed between the input and output layers. These networks can be profoundly deep, incorporating tens or even hundreds of layers. Each layer undertakes computations, forwarding the modified data to the subsequent layer in succession. + +## Components of deep learning network +In the sections above we introduced concept of deep learning and we talked a little bit about biology behind it. Let's move now to explaining separate components of deep learning architecture. + +- **Input layer** refers to the first layer of nodes in an artificial neural network. This layer receives input data from the outside world. + +- **Hidden layers** are what make neural networks "deep" and enable them to learn complex data representations. + +- **Output layer** is the final layer in the neural network where desired predictions are obtained. There is one output layer in a neural network that produces the desired final prediction. + +## How information is transferred between the layers? + +In neural networks, information is transferred between layers through weighted connections. Each neuron in a layer receives input from neurons in the previous layer, multiplies these inputs by corresponding weights, sums them up, and applies an activation function to produce an output. This output becomes the input for neurons in the next layer, and the process repeats for each layer until reaching the output layer. + +## Loss function +The loss function in deep learning measures how well the model's predictions match the actual targets in the training data. It quantifies the difference between predicted and true values, providing feedback for adjusting the model's parameters during training. The goal is to minimize the loss function, indicating better alignment between predictions and actual outcomes, ultimately improving the model's performance. + +Mathematically speaking the loss function is the chosen family of mathematical equations which returns a scalar that is smaller when the model maps inputs to outputs better. 
$L[\theta, f[x, \theta],$
{{}}
\{ x_{i}, y_{i} \}^{I}_{i=1}
{{}}
$]$


where:

- $ f[x, \theta] $ is a deep learning model

- {{}} \{ x_{i}, y_{i} \}^{I}_{i=1} {{}} is the training data

## Training process
Let us denote our training dataset of I pairs of input/output examples:

$\text{Training Data: }$ {{}}\{ x_{i}, y_{i} \}^{I}_{i=1}{{}}

Training consists of finding the model's parameters that minimize the loss function, i.e.:

$\hat{\theta} = argmin[L[\theta, f[x, \theta], \{ [x_{i}, y_{i}] \}^{I}_{i=1}]]$

A single prediction $\hat{y}$ can be denoted in the following way:

$\hat{y} = f[x, \hat{\theta}]$

## Activation function
Activation functions are essential components of neural networks as they introduce nonlinearity, a crucial element for enabling the development of intricate representations and functions. This nonlinearity expands the network's capacity to capture complex relationships and patterns from the inputs, surpassing the capabilities of a basic linear regression model.

Multiple options are available for the choice of the activation function:

- Sigmoid function (only used in the output layer of the logistic regression):
$ a(z) = \frac{1}{1 + e^{-z}} $

- Tanh function:
$ a(z) = \frac{e^{z} - e^{-z}}{e^{z} + e^{-z}} $

- ReLU function:
$ a(z) = \max(0, z) $

## Optimizer
An optimizer in the training process of a neural network is like a guide that helps the network learn from the data more effectively. Its main job is to adjust the model's parameters (like weights and biases) so that the predictions get closer to the actual targets over time. This adjustment is informed by the gradients of the loss function, which indicate the direction of steepest ascent or descent in the parameter space. By iteratively updating the parameters based on these gradients, the optimizer guides the network towards minimizing the loss and improving its predictive accuracy.

## Mathematical representation
After introducing all the necessary concepts, let's take a look at how the layers are represented mathematically.
First, consider a very small neural network with only 2 hidden layers:

 $\small \text{Inputs: }$ {{}}\{ x_{i} \}^{I}_{i=1}{{}}

$ \small \text{$1st$ hidden layer: } h_{1} = a[\beta_{0} + \Omega_{0}x] $

$ \small \text{$2nd$ hidden layer: } h_{2} = a[\beta_{1} + \Omega_{1}h_{1}]$

$ \small \text{Output layer: } y = \beta_{2} + \Omega_{2}h_{2} $

This small neural network can be written mathematically in the following way:
$y = \beta_{2} + \Omega_{2}a[\beta_{1} + \Omega_{1}a[\beta_{0} + \Omega_{0}x]]$
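
To make this concrete, here is a minimal NumPy sketch of the same two-hidden-layer forward pass. The layer sizes, the ReLU activation, and the random weights are illustrative assumptions, not part of the original example.

```python
import numpy as np

def relu(z):
    # ReLU activation: a(z) = max(0, z), applied element-wise
    return np.maximum(0, z)

# Made-up dimensions: 3 inputs, two hidden layers of 4 units each, 1 output
rng = np.random.default_rng(0)
x = rng.normal(size=3)

# Weight matrices (Omega) and bias vectors (beta) for each layer
Omega_0, beta_0 = rng.normal(size=(4, 3)), np.zeros(4)
Omega_1, beta_1 = rng.normal(size=(4, 4)), np.zeros(4)
Omega_2, beta_2 = rng.normal(size=(1, 4)), np.zeros(1)

# Forward pass mirroring the equations above
h_1 = relu(beta_0 + Omega_0 @ x)    # first hidden layer
h_2 = relu(beta_1 + Omega_1 @ h_1)  # second hidden layer
y = beta_2 + Omega_2 @ h_2          # output layer (no activation)

print(y)
```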

+ +Moving now to more general formula with k number of hidden layers. + +$ \small \text{Inputs: }$ {{}}\{ x_{i} \}^{I}_{i=1}{{}} + +$ \small \text{$kth$ hidden layer: } h_{k} = a[\beta_{k-1} + \Omega_{k-1}h_{k-1}] $ + +$ \small \text{Output layer: } y = \beta_{k} + \Omega_{k}h_{k} $ + +where: +- $a$ is an activation function +- $\beta$ is bias vector (biases: additional parameters added to the weighted sum before applying the activation function) +- $\Omega$ is a weight matrix (grid-like structure containing numerical values. Each value in the matrix represents the strength of a connection between neurons in one layer and neurons in the next layer of a neural network.) + +Finally the general equation of the deep neural network could be annotated as following: + +
$y = \beta_{k} + \Omega_{k}a[\beta_{k-1} + \Omega_{k-1}a[\ldots a[\beta_{1} + \Omega_{1}a[\beta_{0} + \Omega_{0}x]] \ldots]]$
### Graphical representation of a deep neural network
This is how a deep neural network can be represented graphically:

+ +

+ +## Coding deep neural network + +Coding deep neural networks involves using programming languages such as Python, which is highly popular due to its simplicity and extensive libraries for machine learning and neural networks. Libraries like TensorFlow and PyTorch are widely used for building and training deep neural networks, providing high-level APIs for constructing complex architectures with ease. + +Building and training deep neural networks involves several steps, including loading and preparing the data, constructing the model architecture, and compiling, fitting, and evaluating the model. In this section, we will break down each of these steps in detail. + +### Loading and Preparing the Data + +In this section, we load the MNIST dataset and preprocess it for training and testing. + +We begin by importing necessary libraries for data manipulation and preprocessing. Then, we load the MNIST dataset using Keras' built-in function `mnist.load_data()`. The dataset consists of 60,000 training images and 10,000 test images, each with their corresponding labels. Next, we preprocess the data by reshaping the input images and normalizing pixel values to a range between 0 and 1. Additionally, we convert the class labels to categorical format using one-hot encoding. + +```python +import numpy as np +from keras.datasets import mnist +from keras.utils import to_categorical + +# Load the MNIST dataset +(X_train, y_train), (X_test, y_test) = mnist.load_data() + +# Preprocess the data +X_train = X_train.reshape((X_train.shape[0], -1)).astype('float32') / 255 +X_test = X_test.reshape((X_test.shape[0], -1)).astype('float32') / 255 +y_train = to_categorical(y_train) +y_test = to_categorical(y_test) + +``` +### Building the model +In this section, we construct the neural network model using Keras' Sequential API. + +We create a Sequential model, which allows us to build a linear stack of layers. The model consists of dense (fully connected) layers, which are interconnected neurons. Each dense layer performs a linear operation on the input data followed by a non-linear activation function. In our model, we use ReLU (Rectified Linear Activation) as the activation function for hidden layers and softmax for the output layer. The `units` parameter specifies the number of neurons in each layer, and `input_shape` defines the shape of the input data for the first layer. + +``` +from keras.models import Sequential +from keras.layers import Dense + +# Creating a Sequential model +model = Sequential() + +# Adding layers to the model +model.add(Dense(units=64, activation='relu', input_shape=(X_train.shape[1],))) +model.add(Dense(units=64, activation='relu')) +model.add(Dense(units=10, activation='softmax')) +``` + +### Compiling, fitting and evaluating the model +In this section, we compile the model with an optimizer, loss function, and evaluation metric, train the model on the training data, and evaluate its performance on the test data. + +After building the model, we compile it using the `compile()` method. Here, we specify the optimizer (Adam), loss function (categorical cross-entropy), and evaluation metric (accuracy). Then, we train the model on the training data using the `fit()` method, specifying the number of epochs (iterations over the entire dataset) and batch size (number of samples per gradient update). Finally, we evaluate the trained model on the test data to measure its performance in terms of loss and accuracy. 
+ +``` +# Compiling the model +model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) + +# Training the model +model.fit(X_train, y_train, epochs=10, batch_size=128, verbose=1, validation_split=0.2) + +# Evaluating the model +loss, accuracy = model.evaluate(X_test, y_test, verbose=0) +print("Test Loss:", loss) +print("Test Accuracy:", accuracy) + +``` + +{{% summary %}} +### What is Deep Learning? +Deep learning is a branch of machine learning inspired by the structure and function of the human brain. It involves the use of neural networks with multiple layers to process and learn from data. The term "deep" refers to the depth of these networks, which enable them to extract complex patterns and representations from the data. + +### Mimicking the Behavior of Neurons in Computer Algorithms** +Deep learning aims to mimic the behavior of neurons in the human brain through computer algorithms. Researchers study the collaborative nature of neurons and replicate it in artificial neural networks. This involves computing the weighted sum of inputs, applying activation functions, and transmitting signals between layers to process information. + +### Architecture of Deep Neural Networks +Deep neural networks consist of multiple layers, including input, hidden, and output layers. Information flows through the network via weighted connections between neurons in adjacent layers. The architecture allows for the hierarchical extraction of features from raw data, with each layer learning increasingly abstract representations. + +### Components of deep neural networks +The training process involves optimizing the model's parameters to minimize a loss function, which measures the disparity between predicted and actual values. Activation functions introduce nonlinearity to the network, enabling it to capture complex patterns. Practical implementation also includes data preprocessing, model compilation, and evaluation using libraries like TensorFlow and Keras in Python. + +### Practical Implementation in Python +Python is a popular choice for practical implementation of deep learning models due to its simplicity and extensive libraries. TensorFlow and Keras are commonly used libraries for building and training deep neural networks. Practical implementation involves data preprocessing, defining the model architecture, compiling the model, training it on the data, and evaluating its performance. +{{% /summary %}} + +### References: +Prof. Matteo Bustreo, 2024 “Lesson2-Foundations of Deep Learning" https://tilburguniversity.instructure.com/courses/14887/files/2934299?module_item_id=669643. + +McCullum, N. (2021, 28 april). Deep Learning Neural networks explained in Plain English. freeCodeCamp.org. https://www.freecodecamp.org/news/deep-learning-neural-networks-explained-in-plain-english/ diff --git a/content/topics/Analyze/machine-learning/ml-intro/supervised-learning-101.md b/content/topics/Analyze/machine-learning/ml-intro/supervised-learning-101.md index 9c8a38cc0..2a3602b70 100644 --- a/content/topics/Analyze/machine-learning/ml-intro/supervised-learning-101.md +++ b/content/topics/Analyze/machine-learning/ml-intro/supervised-learning-101.md @@ -25,8 +25,7 @@ The article covers several topics: ## What is Supervised Learning? -Supervised Learning involves training a model on a labeled dataset, which means that each input variable in the training sample is paired with an answer key. 
-The model learns by comparing its own predicted output with the true answers that are given, and adjusts itself to minimize errors. +Supervised Learning involves training a model on a labeled dataset, which means that each observation (e.g., customer) in the training sample is "tagged" with a particular outcome (e.g., "buy"). The model learns by comparing its own predicted output with the true labels that are given, and adjusts itself to minimize errors. By doing this, it improves its prediction abilities over time, with the goal of accurately generalizing previously unknown data. There are two types of algorithms: @@ -36,7 +35,7 @@ There are two types of algorithms: With regression, the model predicts continuous or numerical values such as the weather, housing prices, or profit. The goal is to estimate the relationship between independent variables (features) and the dependent variable (target). -Classification, on the other hand, is used for categorical output variables. It assigns data into two or more classes, such as customer churn, identifying emails as spam or not, or recognizing handwritten text. The goal is to correctly classify new input data in their corresponding categories. +Classification, on the other hand, is used for categorical output variables. It assigns data into two or more classes, such as customer retention or churn, identifying emails as spam or not, or recognizing handwritten text. The goal is to correctly classify new input data in their corresponding categories. ### Key Concepts @@ -341,4 +340,4 @@ The key takeaways from this article include: ## Additional Resources * Want to learn more on how to implement these methods and others? Check out [this](https://www.datacamp.com/blog/supervised-machine-learning) article and [this](https://app.datacamp.com/learn/skill-tracks/supervised-machine-learning-in-r) track on Datacamp if you are using R. -* Do you prefer learning how to do supervised learning in Python? Check out [this](https://app.datacamp.com/learn/skill-tracks/supervised-machine-learning-in-python) skill track on Datacamp. \ No newline at end of file +* Do you prefer learning how to do supervised learning in Python? Check out [this](https://app.datacamp.com/learn/skill-tracks/supervised-machine-learning-in-python) skill track on Datacamp. diff --git a/content/topics/Analyze/machine-learning/ml_objective_functions.md b/content/topics/Analyze/machine-learning/ml_objective_functions.md new file mode 100644 index 000000000..69f79b638 --- /dev/null +++ b/content/topics/Analyze/machine-learning/ml_objective_functions.md @@ -0,0 +1,237 @@ +--- +title: "Loss Functions and Cost functions in Machine Learning" +description: "Different loss and cost functions for regression and classification tasks." +keywords: "optimization, cost function, loss function, Machine learning, squared error, regression, classification, learning rate, gradient descent" +date: +weight: 5 +author: "Kheiry Sohooli" +authorlink: "https://tilburgsciencehub.com/contributors/kheirysohooli" + +--- + + +## Overview +The notion of a **loss function** is foundational for the training machine learning models. +It provides a quantitative measure of how well or poorly a model is performing. +Different kinds of machine learning tasks (classification or regression) and datasets may require specific loss functions to achieve optimal results. +Knowledge of the loss function is essential to minimize errors and enhance predictive accuracy. 
In this article, we'll discuss the common cost functions used in both regression and classification problems. This knowledge will help you select the most suitable cost function for your machine learning tasks.

## The Loss Function vs the Cost Function!
These two terms are occasionally used interchangeably, but they refer to different things. The loss function is the difference between the actual and predicted values for an individual entry in the dataset.
The cost function is the average of the loss function across the entire dataset, and it is the quantity that is minimized when fitting the model to the training data.

## Cost function in regression problems

Imagine fitting a simple line (Equation 1) to a dataset with a numeric target using two parameters, $\theta_0$ and $\theta_1$. The aim is to pick the right values for $\theta_0$ and $\theta_1$ so that the gap between the predicted value $h_\theta(x)$ and the actual value $y$ is as small as possible. This requires solving a minimization problem, in which you minimize the cost function. There are various choices for cost functions, which you will become familiar with in the following sections.
+{{}} +h_\theta(x) = \theta_1x + \theta_0 +{{}} + :Equation1 +
+ +

+ +

$h_\theta(x)$ is the objective function for linear regression.

### Mean Squared Error (MSE)

One method involves minimizing the squared difference between $y$ and $h(x)$, then computing the average. In fact, this approach uses the Mean Squared Error (MSE) as the cost function (Equation 2).
+{{}} +J(\theta_1, \theta_0) = \left[ \frac{1}{2m} \sum_{i=1}^{m} (y(i) - h_\theta(x(i)))^2 \right] +{{}} + :Equation 2 +
This cost function is widely used for optimizing the majority of regression problems.
For a more intuitive understanding, let's consider the following example.

{{% example %}}

For simplicity, let's assume $\theta_0=0$, implying that the line passes through the origin $(0,0)$. Our goal is to fit the equation $h(x)=\theta_1*x$ to the sample data shown by red crosses in the following plots. We examine different values for $\theta_1$, such as $1, 0.5, -0.5$, to observe their effect on the cost function $J(\theta_1)$. While $h_\theta(x)$ is a function of $x$, the cost function depends on $\theta_1$. The plots below illustrate how changes in $\theta_1$ values affect the cost function. Note that we need to find the optimal $\theta_1$ that minimizes the cost function.

+ +

+

+ +
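A small Python sketch may help: it evaluates the cost $J(\theta_1)$ for the three candidate slopes. The three sample points are an assumption made for illustration here, since the original plots are not reproduced.

{{% codeblock %}}
```python
import numpy as np

# Illustrative sample points (the "red crosses"); assumed to follow y = x
x = np.array([1.0, 2.0, 3.0])
y = np.array([1.0, 2.0, 3.0])

def cost(theta_1):
    # J(theta_1) = 1/(2m) * sum (y - theta_1 * x)^2, with theta_0 fixed at 0
    m = len(x)
    return np.sum((y - theta_1 * x) ** 2) / (2 * m)

for theta_1 in [1, 0.5, -0.5]:
    print(theta_1, cost(theta_1))
# theta_1 = 1 gives J = 0, the minimum; the other slopes give larger costs
```
{{% /codeblock %}}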

+ +{{% /example %}} + +In real-world scenarios, it's common to have more than one predictors. Therefore, the cost function dependents on multiple variables. In the case of a two-dimensional problem, the cost function takes on a form similar to the following: +

+ +

+ + +The contour plot below illustrates the two-dimensional cost function, representing the mapping of the previous plot onto a 2D surface. Each contour on this plot corresponds to the same cost value for different $\theta$ values. In machine learning, the goal is to navigate towards the smallest contour on this plot, indicating the minimum cost value. Essentially, when $\theta_1$ and $\theta_0$ are set in a way that the line aligns with the data trend, the cost value converges towards the center (minimum value). Conversely, when the objective function is not aligned with the data trend, the cost value is situated farther from the center on the contour levels. This alignment process is crucial in achieving optimal parameter values for effective model fitting. + +
Contour map of the cost function for a two-dimensional problem (Andrew Ng).

+
{{% tip %}}
- MSE is also referred to as L2 loss.
- MSE is notably sensitive to outliers. Because errors are squared, the cost grows quadratically with the size of a residual, so outliers can dominate the cost.
{{% /tip %}}
### Ridge regression and lasso
There are two options available for mitigating overfitting in MSE:
- Applying L2 regularization to the MSE equation.
- Applying L1 regularization to the MSE equation.

When the L2 regularization term is added to the MSE, the resulting model is referred to as Ridge regression. In Ridge regression, the model faces a penalty if the coefficients become large: the regularization term acts to diminish the impact of the coefficients, mitigating the risk of overfitting. The $\lambda$ term controls the degree of regularization, with an increase in $\lambda$ intensifying regularization; typically, the default value is set to 1. In the formula below, $\lambda \sum_{j=1}^{p} \theta_j^2$ is the regularization term.
+{{}} +Min \sum_{i=1}^{n} \left(y_i - \sum_{j=1}^{p} x_{ij}\theta_j\right)^2 + \lambda \sum_{j=1}^{p} \theta_j^2 +{{}} +
{{% tip %}}

As the regularization strength increases, the coefficients ($\theta$) decrease, yet they never reach zero.

{{% /tip %}}

By adding an L1 regularization penalty term to the MSE, the resulting model is called lasso, short for Least Absolute Shrinkage and Selection Operator. Similar to the previous regularization method, the parameter $\lambda$ controls the strength of the regularization, but in this method many weights are set to exactly zero.
Note that L1 regularization favors sparse models by encouraging coefficients to be exactly zero, facilitating automatic feature selection where some features are entirely ignored.
+{{}} +Min \sum_{i=1}^{n} \left(y_i - \sum_{j=1}^{p} x_{ij}\theta_j\right)^2 + \lambda \sum_{j=1}^{p} |\theta_j| +{{}} +
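As a quick illustration (not part of the original article), both estimators are available in scikit-learn, where the `alpha` argument plays the role of $\lambda$ in the formulas above; the toy data below are made up.

{{% codeblock %}}
```python
import numpy as np
from sklearn.linear_model import Ridge, Lasso

# Toy data: 100 observations, 5 features; only the first two features matter
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = 3 * X[:, 0] - 2 * X[:, 1] + rng.normal(scale=0.5, size=100)

# alpha plays the role of lambda in the formulas above
ridge = Ridge(alpha=1.0).fit(X, y)
lasso = Lasso(alpha=0.1).fit(X, y)

print("Ridge coefficients:", ridge.coef_)  # shrunk towards zero, but not exactly zero
print("Lasso coefficients:", lasso.coef_)  # typically some coefficients are exactly zero
```
{{% /codeblock %}}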
### Mean Absolute Error (MAE)
This loss function is also known as L1 loss. It gives the average of the absolute errors of all data samples. In contrast to MSE, which takes the average of the squared errors, it takes the average of the absolute errors. This characteristic makes it robust to outliers. Therefore, it is more suitable if your dataset contains outliers or noise.
+{{}} +MAE = \frac{1}{n}\sum_{i=1}^{n} |y_i - \hat{y}_i| +{{}} +
{{% tip %}}

Although MAE is less sensitive to outliers than MSE, MSE is sometimes preferred because MAE is not differentiable at zero.

{{% /tip %}}

### Smooth Mean Absolute Error or Huber Loss
This cost function is a mixture of the two previous cost functions. It behaves like MSE when the error is small, which keeps it differentiable close to the minimum, and like MAE when the error is large, which makes it less sensitive to outliers and noise. The hyperparameter $\delta$ controls the point at which the loss switches from the MSE-like part to the MAE-like part.
{{}}
\begin{cases}
\frac{1}{2} (y_i - \hat{y}_i)^2 & \text{for } |y_i - \hat{y}_i| \leq \delta \\\
\delta \cdot |y_i - \hat{y}_i| - \frac{1}{2} \delta^2 & \text{otherwise}
\end{cases}
{{}}
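To see how the three regression losses compare, here is a small NumPy sketch added for illustration; the predictions are made up, and `delta` is the Huber hyperparameter $\delta$ discussed above.

{{% codeblock %}}
```python
import numpy as np

def mse(y, y_hat):
    return np.mean((y - y_hat) ** 2)

def mae(y, y_hat):
    return np.mean(np.abs(y - y_hat))

def huber(y, y_hat, delta=1.0):
    error = y - y_hat
    small = np.abs(error) <= delta
    # quadratic (MSE-like) part for small errors, linear (MAE-like) part otherwise
    return np.mean(np.where(small,
                            0.5 * error ** 2,
                            delta * np.abs(error) - 0.5 * delta ** 2))

y     = np.array([1.0, 2.0, 3.0, 4.0])
y_hat = np.array([1.1, 1.9, 3.2, 8.0])  # the last prediction has an outlier-sized error

print(mse(y, y_hat), mae(y, y_hat), huber(y, y_hat))
# the outlier inflates the MSE much more than the MAE or the Huber loss
```
{{% /codeblock %}}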
+ + + + +## Cost function in classification + +Cost functions used in classification differ from those in regression models, primarily due to the nature of the target variable. In regression models, the target variable is continuous, whereas in classification problems, it is discrete. The most common cost function in classification is the cross-entropy loss, which comes in two variations: Binary Cross-Entropy for binary classification and Categorical Cross-Entropy for multiclass classification. Another commonly used cost function in classification is Hinge loss. + +### Cost Function for Binary Classification Tasks +In machine learning problems where the target variable has two classes, Binary Cross-Entropy is used. This loss function is alternatively referred to as log loss, logistic loss, and maximum likelihood. Additionally, L1/L2 regularization terms can be incorporated into this loss function. + +
+{{}} +J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log(h_\theta(x^{(i)})) + (1 - y^{(i)}) \log(1 - h_\theta(x^{(i)})) \right] +{{}} +
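A minimal NumPy sketch of binary cross-entropy, assuming 0/1 labels and predicted probabilities (the numbers are made up for illustration):

{{% codeblock %}}
```python
import numpy as np

def binary_cross_entropy(y_true, y_prob, eps=1e-12):
    # clip the probabilities to avoid log(0)
    y_prob = np.clip(y_prob, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))

y_true = np.array([1, 0, 1, 1])
y_prob = np.array([0.9, 0.2, 0.7, 0.4])  # predicted probabilities h_theta(x)

print(binary_cross_entropy(y_true, y_prob))
```
{{% /codeblock %}}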
### Cost Function for Multiclass Classification Models

In a classification problem with multiple classes as the target variable, it is necessary to use categorical cross-entropy as the loss function.
+{{}} +\text{Categorical Cross-Entropy} = -\frac{1}{n}\sum_{i=1}^{n}\sum_{j=1}^{m}y_{ij} \cdot \log(\hat{y}_{ij}) +{{}} +
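A minimal NumPy sketch of categorical cross-entropy, assuming one-hot encoded labels and predicted class probabilities (made-up numbers for illustration):

{{% codeblock %}}
```python
import numpy as np

def categorical_cross_entropy(y_true, y_prob, eps=1e-12):
    # y_true: one-hot labels (n x m), y_prob: predicted class probabilities (n x m)
    y_prob = np.clip(y_prob, eps, 1.0)
    return -np.mean(np.sum(y_true * np.log(y_prob), axis=1))

y_true = np.array([[1, 0, 0],
                   [0, 1, 0]])
y_prob = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.6, 0.3]])

print(categorical_cross_entropy(y_true, y_prob))
```
{{% /codeblock %}}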
### Hinge loss
Another alternative is hinge loss, also referred to as multi-class SVM loss. This particular cost function is designed to maximize the margin and is commonly used for optimizing Support Vector Machine (SVM) models.
+{{}} +\text{Hinge Loss} = \max(0, 1 - y_i \cdot \hat{y}_i) +{{}} +
The hinge loss equals zero when $x_i$ is positioned correctly with respect to the margin (that is, when the prediction is correct). However, for data located on the incorrect side of the margin, the function's value is directly proportional to the distance from the margin.

By adding a regularization term to the equation, the parameter $\lambda > 0$ plays a crucial role in balancing two goals: enlarging the margin and keeping $\max(0, 1 - y_i \cdot \hat{y}_i)$ small, so that $x_i$ lies on the correct side of the margin.
+{{}} +\text{SVM Cost Function} = \frac{1}{n} \sum_{i=1}^{n} \left[ \max(0, 1 - y_i \cdot \hat{y}_i) + \lambda \lVert \mathbf{w} \rVert^2 \right] +{{}} +
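A minimal NumPy sketch of this regularized SVM cost, assuming labels coded as $-1/+1$ (a common convention for hinge loss) and made-up decision values and weights:

{{% codeblock %}}
```python
import numpy as np

def svm_cost(y, y_hat, w, lam=0.01):
    # y in {-1, +1}, y_hat = raw decision values, w = weight vector
    hinge = np.maximum(0, 1 - y * y_hat)
    return np.mean(hinge) + lam * np.sum(w ** 2)

y     = np.array([1, -1, 1, -1])
y_hat = np.array([0.8, -1.2, -0.3, 0.1])  # the last two points violate the margin
w     = np.array([0.5, -0.4])

print(svm_cost(y, y_hat, w))
```
{{% /codeblock %}}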
+ + +{{% example %}} +Consider optimizing a logistic regression problem involving two classes, denoted as 0 and 1. In this context, the objective function is represented by the sigmoid function, which is outlined as follows: + +
{{}}
h_\theta(x) = \frac{1}{1 + e^{-\theta \cdot x}}
{{}}
Equation 3
and a suitable cost function is Binary Cross-Entropy.


The next figure visualizes the cost function across various combinations of the predicted value ($h_\theta(x)$) and the actual label, for $y$ equal to $0$ or $1$. This visualization provides insight into how the cost varies across different scenarios. For instance, when the predicted value is $0$ while the actual value is $1$, the cost function goes towards infinity.

+ +

+{{% /example %}} + + +{{% summary %}} + * Definition of Cost and Loss function + * Cost functions aplicabple in regression problems + - Mean Squared Error: Measures the average squared difference between predicted and actual values. + - Ridge regresssion: Introduces regularization by penalizing large coefficients to prevent overfitting. + - Lasso Regression: Another regularization technique penalizing the absolute size of coefficients. + - Mean Absolute Error : Computes the average absolute difference between predicted and actual values. + - Smooth Mean Absolute Error: An improved version of MAE with a smooth transition for small errors. + * Cost functions applicable in classification problems + - Binary Cross-Entropy: Used in binary classification tasks to quantify the difference between predicted and actual class probabilities. + - Categorical Cross-Entropy: Suitable for multi-class classification, calculates the difference between predicted and actual class probabilities. + - Hing Loss: Commonly used in support vector machines (SVMs) for binary classification, penalizing misclassified samples. + + * provided examples to enhance understanding of the concepts. + +{{% /summary %}} diff --git a/content/topics/Analyze/machine-learning/supervised/images/2_dim_costfunction.png b/content/topics/Analyze/machine-learning/supervised/images/2_dim_costfunction.png new file mode 100644 index 000000000..5aba45f07 Binary files /dev/null and b/content/topics/Analyze/machine-learning/supervised/images/2_dim_costfunction.png differ diff --git a/content/topics/Analyze/machine-learning/supervised/images/classificationcostfunction.png b/content/topics/Analyze/machine-learning/supervised/images/classificationcostfunction.png new file mode 100644 index 000000000..3e168705d Binary files /dev/null and b/content/topics/Analyze/machine-learning/supervised/images/classificationcostfunction.png differ diff --git a/content/topics/Analyze/machine-learning/supervised/images/costfunctionplot_1.png b/content/topics/Analyze/machine-learning/supervised/images/costfunctionplot_1.png new file mode 100644 index 000000000..93332342f Binary files /dev/null and b/content/topics/Analyze/machine-learning/supervised/images/costfunctionplot_1.png differ diff --git a/content/topics/Analyze/machine-learning/supervised/images/costfunctionplot_2.png b/content/topics/Analyze/machine-learning/supervised/images/costfunctionplot_2.png new file mode 100644 index 000000000..12c8ee5ca Binary files /dev/null and b/content/topics/Analyze/machine-learning/supervised/images/costfunctionplot_2.png differ diff --git a/content/topics/Analyze/machine-learning/supervised/images/countourmap_2d.jpeg b/content/topics/Analyze/machine-learning/supervised/images/countourmap_2d.jpeg new file mode 100644 index 000000000..aa78123a2 Binary files /dev/null and b/content/topics/Analyze/machine-learning/supervised/images/countourmap_2d.jpeg differ diff --git a/content/topics/Analyze/machine-learning/supervised/images/hypothesis_function.png b/content/topics/Analyze/machine-learning/supervised/images/hypothesis_function.png new file mode 100644 index 000000000..6eb09091f Binary files /dev/null and b/content/topics/Analyze/machine-learning/supervised/images/hypothesis_function.png differ diff --git a/content/topics/Analyze/machine-learning/supervised/images/poly.png b/content/topics/Analyze/machine-learning/supervised/images/poly.png new file mode 100644 index 000000000..557b6e4c1 Binary files /dev/null and 
b/content/topics/Analyze/machine-learning/supervised/images/poly.png differ diff --git a/content/topics/Analyze/machine-learning/supervised/images/scatter_iris.png b/content/topics/Analyze/machine-learning/supervised/images/scatter_iris.png new file mode 100644 index 000000000..d5d8fa16c Binary files /dev/null and b/content/topics/Analyze/machine-learning/supervised/images/scatter_iris.png differ diff --git a/content/topics/Analyze/machine-learning/supervised/svm.md b/content/topics/Analyze/machine-learning/supervised/svm.md new file mode 100644 index 000000000..553c678ca --- /dev/null +++ b/content/topics/Analyze/machine-learning/supervised/svm.md @@ -0,0 +1,360 @@ +--- +title: "Support Vector Machines in Python" +description: "Support Vector Machines (SVM) stand out as one of the most popular and effective machine learning classifiers which allow to accomodate for non-linear class boundaries." +keywords: "SVM, machine learning, Python" +date: 17/02/2024 +weight: #1 +author: "Matteo Zicari" +authorlink: "https://tilburgsciencehub.com/contributors/matteozicari/" +aliases: + - /supportvectormachine + - /machinelearning + - /Python + +--- + + +## Overview + +The objective of this article is to provide a practical guide to [Support Vector Machines](https://scikit-learn.org/stable/modules/svm.html#svm) (SVM) in Python. SVMs are supervised machine learning models that can handle both linear and non-linear class boundaries by selecting the best line (or plane, if not two-dimensional) that divides the prediction space to maximize the margin between the classes we are trying to classify. + + +## Python Application + +### Loading Dataset + +In this application, we will be using the sklearn Iris dataset. The dataset contains three different target variables corresponding to three different species of iris: *setosa* (0), *versicolor* (1), and *virginica* (2). The goal is to use the sepal length and width of each iris to predict its species. + +{{% codeblock %}} +```python +# Importing libraries +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from sklearn import datasets +``` +{{% /codeblock %}} + + +{{% codeblock %}} +```python +iris = datasets.load_iris() + +# Transforming 'iris' from an array to a dataframe +iris_df = pd.DataFrame(iris.data, columns = iris.feature_names) + +# Adding a target variable (dependent variable of our model) to the dataframe +iris_df['target'] = iris.target +``` +{{% /codeblock %}} + + +{{% codeblock %}} +```python +# Creation of dataset with only sepal features as dependent variables +iris_df = iris_df.drop(['petal length (cm)', 'petal width (cm)'], axis=1) +``` +{{% /codeblock %}} + +Creating a scatter plot to compare the sepal length and width of different species. 
+ +{{% codeblock %}} +```python +# Creation of dataframes by species +setosa = iris_df[iris_df['target'] == 0] +versicolor = iris_df[iris_df['target'] == 1] +virginica = iris_df[iris_df['target'] == 2] + +# Setting figure size +plt.rcParams['figure.figsize'] = (6, 4) + +# Plotting each dataframe +plt.scatter(setosa['sepal length (cm)'], setosa['sepal width (cm)'], color='#003f5c', label='Setosa') +plt.scatter(versicolor['sepal length (cm)'], versicolor['sepal width (cm)'], color='#ffa600', label='Versicolor') +plt.scatter(virginica['sepal length (cm)'], virginica['sepal width (cm)'], color='green', label='Virginica') + +# Scatter plot settings +plt.xlabel('Sepal Length (cm)') +plt.ylabel('Sepal Width (cm)') +plt.title('Sepal Length vs Sepal Width (by species)') +plt.legend() + +``` +{{% /codeblock %}} + +

+ +

+ + +From the graph above, it's evident that an iris setosa can be easily distinguished based on its sepal length and width. However, for the other two species, the division boundary appears to be far from linear, indicating the need for further analysis. + + +### Training and Test Data + +The following code snippet splits the dataset into two parts, one containing the input features (X) and the other containing the target variable (y). + +{{% codeblock %}} +```python +X, y = iris_df.iloc[:, :2], iris_df.target +``` +{{% /codeblock %}} + +Then, the code further splits X and y into training and testing sets using the *train_test_split* function from the *sklearn.model_selection* module. + +{{% codeblock %}} +```python +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21) +``` +{{% /codeblock %}} + + + +### Model Fitting + +In this section of the article, we will apply a support vector machine algorithm to the dataset and tune the hyperparameters to identify the combination that yields the highest prediction accuracy. + +A paramount step in this implementation is the selection of a [kernel function](https://scikit-learn.org/stable/modules/svm.html#svm-kernels), which defines the shape (e.g., linear or non-linear) of the decision boundary. The most commonly used kernels are the linear, radial basis function (rbf), and polynomial (poly) kernel. Given the non-linearity of our decision boundary (as seen in the scatter plot above), we will be employing the *rbf* and *poly* kernels and assess which of the two performs better. + +General implementation: + +{{% codeblock %}} +```python +from sklearn.svm import SVC + +# SVM model +svm_model = SVC(kernel='rbf', C=1, random_state=42) # poly can be used instead of rbf + +# Fitting the model +svm_model.fit(X_train, y_train) +``` +{{% /codeblock %}} + +### Hyperparameters Tuning + +The code above features several self-selected parameters (e.g, kernel, C) that serve as inputs for the model. Tuning these parameters involves substituting different instances of them into the model to evaluate how prediction accuracy varies. The final objective is to select the model with the highest prediction accuracy. + +*RBF Kernel* + +{{% codeblock %}} +```python +from sklearn.metrics import accuracy_score + +sigma = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30] +C = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30] + +accuracy = list() +sigma_c = list() + +for each_s in sigma: + for each_c in C: + svm_model = SVC(kernel='rbf', gamma=1/(2*(each_s**2)), C=each_c, random_state=42) + svm_model.fit(X_train, y_train) + y_pred = svm_model.predict(X_test) + accuracy.append(accuracy_score(y_test, y_pred)) + sigma_c.append((each_s, each_c)) +``` +{{% /codeblock %}} + + +{{% tip %}} + +Hyperparameters: + +- *C* = penalty parameter for misclassification. A smaller value of C results in higher potential misclassification but in lower potential overfitting. + +- *gamma* = influence of a single training example. A smaller gamma implies that points that are further apart are considered similar, hence, the influence of each training example is less localised. Higher values of gamma may lead to overfitting. 
+ +{{% /tip %}} + + +{{% codeblock %}} +```python +# Identifying highest accuracy +index = np.argmax(accuracy) + +# Identifying optimal parameters +sigma_opt, c_opt = sigma_c[index] + +print(sigma_opt) +print(c_opt) +``` +{{% /codeblock %}} + + + +{{% codeblock %}} +```python +sigma = sigma_opt +gamma = 1/(2*sigma_opt**2) +C = c_opt + +# SVM model with optimal parameters +optimal_svm_rbf = SVC(kernel='rbf', gamma=gamma, C=C, random_state=42) +optimal_svm_rbf.fit(X_train, y_train) + +# Training set prediction and accuracy +y_pred_train = optimal_svm_rbf.predict(X_train) +train_accuracy_rbf = accuracy_score(y_train, y_pred_train) + +# Test set prediction and accuracy +y_pred_test = optimal_svm_rbf.predict(X_test) +test_accuracy_rbf = accuracy_score(y_test, y_pred_test) + +print(train_accuracy_rbf) +print(test_accuracy_rbf) +``` +{{% /codeblock %}} + + + +*Poly Kernel* + + +{{% codeblock %}} +```python +degree = [1, 2, 3, 4, 5] +C = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30] + +accuracy = list() +d_c = list() + +for each_d in degree: + for each_c in C: + svm_model = SVC(kernel='poly', degree = each_d, C=each_c, random_state=42) + svm_model.fit(X_train, y_train) + y_pred = svm_model.predict(X_test) + accuracy.append(accuracy_score(y_test, y_pred)) + d_c.append((each_d, each_c)) +``` +{{% /codeblock %}} + +{{% tip %}} + +Hyperparameters: + +- *C* = penalty parameter for misclassification. A smaller value of C results in higher potential misclassification but in lower potential overfitting. + +- *degree* = degree of the polynomial function used to map the input data in the prediction space. The higher the degree, the greater the flexibility of the model and the potential for overfitting. + + +{{% /tip %}} + + +{{% codeblock %}} +```python +# Identifying highest accuracy +index = np.argmax(accuracy) + +# Identifying optimal parameters +d_opt, c_opt = d_c[index] + +print(d_opt) +print(c_opt) +``` +{{% /codeblock %}} + + +{{% codeblock %}} +```python +degree = d_opt +C = c_opt + +# SVM model with optimal parameters +optimal_svm_poly = SVC(kernel='poly', degree=degree, C=C, random_state=42) +optimal_svm_poly.fit(X_train, y_train) + +# Training set prediction and accuracy +y_pred_train = optimal_svm_poly.predict(X_train) +train_accuracy_poly = accuracy_score(y_train, y_pred_train) + +# Test set prediction and accuracy +y_pred_test = optimal_svm_poly.predict(X_test) +test_accuracy_poly = accuracy_score(y_test, y_pred_test) + +print(train_accuracy_poly) +print(test_accuracy_poly) +``` +{{% /codeblock %}} + + +{{% codeblock %}} +```python +print(f'RBF Training Accuracy: {train_accuracy_rbf}') #0.87 +print(f'POLY Training Accuracy: {train_accuracy_poly}') #0.84 +print(f'RBF Test Accuracy: {test_accuracy_rbf}') #0.77 +print(f'POLY Test Accuracy: {test_accuracy_poly}') #0.80 +``` +{{% /codeblock %}} + +As a result of employing the two different SVM models, one with the *rbf* kernel and the other with the *poly* kernel, we observe that the former performs slightly better on the training sample, while the latter performs slightly better on the test sample. Given a test set accuracy of 80%, indicating a higher ability to generalise to unseen data, the *poly* kernel (degree = 2 and C = 0.03) is the optimal choice. + +The following is a plot of the SVM classifier using the optimal *poly* kernel. 
+ +{{% codeblock %}} +```python +from matplotlib.colors import ListedColormap + +# Settings +x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1 +y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1 + +xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), + np.arange(y_min, y_max, 0.02)) +Z = optimal_svm_poly.predict(np.c_[xx.ravel(), yy.ravel()]) +Z = Z.reshape(xx.shape) + +plt.figure(figsize=(6, 4)) + +# Define colours for each class +setosa_color = '#003f5c' +versicolor_color = '#ffa600' +virginica_color = 'green' + +colors = [setosa_color, versicolor_color, virginica_color] +cmap = ListedColormap(colors) + +# Plot decision boundary and colour zones using custom colormap +plt.contourf(xx, yy, Z, alpha=0.2, cmap=cmap) + +# Scatter plot for each class +setosa = iris_df[iris_df['target'] == 0] +versicolor = iris_df[iris_df['target'] == 1] +virginica = iris_df[iris_df['target'] == 2] + +plt.scatter(setosa['sepal length (cm)'], setosa['sepal width (cm)'], color='#003f5c', label='Setosa') +plt.scatter(versicolor['sepal length (cm)'], versicolor['sepal width (cm)'], color='#ffa600', label='Versicolor') +plt.scatter(virginica['sepal length (cm)'], virginica['sepal width (cm)'], color='green', label='Virginica') + +# Plot decision boundary lines +plt.contour(xx, yy, Z, colors='k', linewidths=1, alpha=0.5) + +# Add labels, title, and legend +plt.xlabel('Sepal Length (cm)') +plt.ylabel('Sepal Width (cm)') +plt.suptitle('Decision boundary of poly kernel') +plt.title('degree = 2, C = 0.03', fontsize=8) +plt.legend() + +# Show the plot +plt.show() + +``` +{{% /codeblock %}} + +

*(Figure: decision boundary of the tuned poly kernel over the sepal measurements)*
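The manual tuning loops above make the mechanics of the search explicit. If you prefer to automate it, scikit-learn's `GridSearchCV` wraps the same idea with built-in cross-validation. The sketch below is an optional alternative rather than part of the original walkthrough; the candidate values mirror those used above, and note that model selection here happens on validation folds instead of the test set.

{{% codeblock %}}
```python
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Candidate values mirror the manual loops above; 'degree' only affects the poly kernel
param_grid = {
    "kernel": ["rbf", "poly"],
    "C": [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30],
    "gamma": ["scale", 0.1, 1, 10],
    "degree": [1, 2, 3, 4, 5],
}

grid = GridSearchCV(SVC(random_state=42), param_grid, cv=5)
grid.fit(X_train, y_train)

print(grid.best_params_)           # best combination found via cross-validation
print(grid.score(X_test, y_test))  # accuracy of the refitted best model on the test set
```
{{% /codeblock %}}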
+ +{{% summary %}} +This article introduces Support Vector Machines in Python by providing: + +- a step-by-step practical application to employing the algorithm; +- guide on hyperparameters tuning for selection of the best-performing model. + + +{{% /summary %}} + + diff --git a/content/topics/Automation/Replicability/cloud-computing/colab-github.md b/content/topics/Automation/Replicability/cloud-computing/colab-github.md new file mode 100644 index 000000000..28e51b34f --- /dev/null +++ b/content/topics/Automation/Replicability/cloud-computing/colab-github.md @@ -0,0 +1,156 @@ +--- +title: "Combine GitHub and Google Colab for Collaborative Development" +description: "Explore how integrating GitHub with Google Colab can streamline your development workflow, enabling more efficient project management and collaboration." +keywords: "GitHub, Google Colab, Collaboration, Project Management, Development Workflow, Git Commands, Persistent Storage, Project Visibility" +weight: 3 +author: "Fernando Iscar" +authorlink: "https://www.linkedin.com/in/fernando-iscar/" +draft: false +date: 2024-03-25T10:00:00+00:00 +aliases: + - /github_colab/integration +--- + +## Introduction + +This building block aims to enhance your development process by illustrating the synergistic relationship between GitHub and Google Colab. It's designed for students and researchers looking to optimize their workflow for more efficient and collaborative project management. + +{{% tip %}} +If you're a newcomer to Google Colab, might be worth to give a read to [this short introduction](https://tilburgsciencehub.com/topics/automation/replicability/cloud-computing/google-colab/) before diving into its integration with GitHub. + +{{% /tip %}} + +By the end of this guide, you will: + +- Understand how to import GitHub repositories into Google Colab. +- Be familiar with executing Git commands and pushing changes directly from Colab. +- Learn strategies to handle large files. + +## Setting Up the Workspace + +### Importing GitHub Repositories into Colab + +Colab offers a seamless method to clone your GitHub repository into the environment, allowing you to work directly on your projects without switching platforms. This integration simplifies accessing and working on your code. + +To set up the workspace, you'll first need a GitHub repository to work from, which can be either public or private. Also, make sure you are logged in into your Google Account. Then, do the following: + +**1.** Go to [Google Colab](https://colab.google/) and click on *'Open Colab'*. + +**2.** In the *'File'* menu, select *'Open notebook'*, then go to the *'GitHub'* tab. You can enter the URL of your repository or search for it using your GitHub username. Include private repositories if necessary by clicking on the respective option. + +**3.** After finding your repository, click on the notebook you want to open. + +

*Opening a notebook from GitHub in Google Colab.*
+ +Once you have the notebook open in Google Colab, you can start working directly on it. However, if you need access to other files or directories within your GitHub repository, like a dataset, cloning it might be necessary. This can be done by executing a git clone command in a cell: + +```bash +!git clone https://github.com/your-username/your-repository.git +``` +After doing this, go to the files tab and press the refresh button, as shown below: +

*Clone to access the notebook's GitHub repo.*
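With the repository cloned and visible in the Files tab, its contents can be read by path like any other local file. A minimal sketch, assuming the cloned repository is named `your-repository` and contains a CSV at `data/example.csv` (both names are illustrative):

```python
import pandas as pd

# Cloned repositories live under /content in a Colab session
df = pd.read_csv("/content/your-repository/data/example.csv")
df.head()
```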
+ +{{% warning %}} +**Cloning private repos** + +If you want to clone a private repository, you will need to provide your GitHub username and password. To do this securely, you can use Git credentials or SSH keys. Here's how you can clone a private repository using Git credentials: + +1. Generate a personal access token (PAT) on GitHub. You can follow the instructions [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) to generate a PAT. +2. In the Colab notebook, execute the following command, replacing `your-username` and `your-repository` with your GitHub username and repository name: + +```bash +!git clone https://your-PAT:x-oauth-basic@github.com/your-username/your-repository.git +``` +{{% /warning %}} + +## Working with GitHub in Colab + +### Executing Basic Git Commands + +Google Colab's environment allows for the execution of Git commands, enabling version control operations directly within your notebook. This feature is crucial for tracking changes, managing branches, and collaborating on projects hosted on GitHub. + +Some of the basic Git commands you can execute in Colab include: + +- `!git status` to check the status of your repository. +- `!git add .` to stage all changes in the repository. +- `!git commit -m "Your commit message"` to commit the staged changes. +- `!git push` to push committed changes to the remote repository. + +Using commands within the Colab interface can often be the most straightforward method for certain tasks, such as cloning a repository. However, depending on the situation, utilizing the features of either GitHub or Colab may offer the most convenience and efficiency. For detailed guidance on employing git commands within Colab, please refer to [this article](https://medium.com/analytics-vidhya/how-to-use-google-colab-with-github-via-google-drive-68efb23a42d). + +### Pushing Changes Using the Colab Interface + +In addition to executing Git commands directly in Colab, you can also use the Colab interface to push changes to your GitHub repository. This provides a more user-friendly and visual way to manage your commits and push them to the remote repository. + +To push changes using the Colab interface, follow these steps: + +1. Make sure you have made the necessary changes to your notebook or files. +2. In the Colab menu, click on *'File'* and select *'Save a copy in GitHub'*. +3. A dialog box will appear, allowing you to specify the repository, branch, and commit message. Fill in the required information and click on *'OK'*. + +

*Dialog box for saving a copy in GitHub.*
+ +{{% tip %}} +To make it easier for you and your collaborators to access the notebook directly from the GitHub repository, it is recommended to tick the box *'Include a link to Colab'*. This way, you can simply go to the notebook file and click on the following icon to launch it: + +

*Click here to launch the notebook from the repo.*
+ +{{% /tip %}} + +4. Colab will create a new commit with your changes and push it to the specified repository and branch. + +## Other relevant information + +### Integrating with Other Google Services + +#### Mounting Google Drive for Persistent Storage + +As mentioned in [this building block](https://tilburgsciencehub.com/topics/automation/replicability/cloud-computing/google-colab/), mounting your Google Drive in Google Colab is a good practice when working with large files. It provides convenient access to files, datasets, and resources stored in your Google Drive within the Colab environment. + +Benefits of using Google Drive in Colab include: +- Storage of large files that exceed Git repository limitations. +- Easy collaboration and sharing with team members or collaborators. +- Persistent storage, ensuring accessibility across Colab sessions. + +#### Use Google Cloud Buckets + +Another option for storage management in Google Colab is to use Google Cloud Storage Buckets. These are a scalable and durable object storage service provided by Google Cloud Platform. You can find more information in [this building block](https://tilburgsciencehub.com/topics/collect-store/data-storage/commercial-cloud/mem-storage-gcp/). + +### Consider sharing with GitHub Gists + +Colab also offers the option to save a copy as a GitHub Gist. Gists are ideal for quick sharing, when code is too small for creating a repository, and can be embedded in blogs or documents. Keep in mind: + +- Gists are public by default. Use private repositories or Google Drive for sensitive content. +- For creating and managing gists, consult the [GitHub Gist documentation](https://docs.github.com/en/github/writing-on-github/creating-gists). + +### Leveraging Colab's GPUs + +Colab offers free limited access to powerful GPUs, enhancing computational capabilities for data processing and machine learning tasks. To enable a GPU: + +1. Go to *'Runtime' > 'Change runtime type'* in the Colab menu. +2. Select *'T4 GPU'* as the hardware accelerator and save. +3. Verify GPU activation by executing: `!nvidia-smi` in a new cell. This command will output details about the GPU assigned to your session, including its type, memory usage, and the processes running on it. If a GPU is available, you will see its specifications. If not, you might receive an error message indicating no NVIDIA GPU is detected. + +{{% summary %}} + +This topic covers the steps to clone a repository, work with GitHub in Colab, execute basic Git commands, and push changes using the Colab interface. Additionally, it suggests other resources to enhance the collaboration experience. 
+ +{{% /summary %}} + +## Additional Resources + +- [Google Cloud Storage Documentation](https://cloud.google.com/storage/docs) +- [Google Colab Documentation](https://colab.research.google.com/notebooks/intro.ipynb) +- [GitHub and Colab Demo](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb) \ No newline at end of file diff --git a/content/topics/Automation/Replicability/cloud-computing/images/clone-colab-repo.png b/content/topics/Automation/Replicability/cloud-computing/images/clone-colab-repo.png new file mode 100644 index 000000000..d1262a312 Binary files /dev/null and b/content/topics/Automation/Replicability/cloud-computing/images/clone-colab-repo.png differ diff --git a/content/topics/Automation/Replicability/cloud-computing/images/colab-link.png b/content/topics/Automation/Replicability/cloud-computing/images/colab-link.png new file mode 100644 index 000000000..0eb2e1ec4 Binary files /dev/null and b/content/topics/Automation/Replicability/cloud-computing/images/colab-link.png differ diff --git a/content/topics/Automation/Replicability/cloud-computing/images/open-nb-colab.png b/content/topics/Automation/Replicability/cloud-computing/images/open-nb-colab.png new file mode 100644 index 000000000..42fb22c03 Binary files /dev/null and b/content/topics/Automation/Replicability/cloud-computing/images/open-nb-colab.png differ diff --git a/content/topics/Automation/Replicability/cloud-computing/images/save-copy-dialog.png b/content/topics/Automation/Replicability/cloud-computing/images/save-copy-dialog.png new file mode 100644 index 000000000..0a590db7e Binary files /dev/null and b/content/topics/Automation/Replicability/cloud-computing/images/save-copy-dialog.png differ diff --git a/content/topics/Automation/Workflows/Starting/principles-of-project-setup-and-workflow-management/checklist.md b/content/topics/Automation/Workflows/Starting/principles-of-project-setup-and-workflow-management/checklist.md index e422d8f92..bdfe2a6f6 100644 --- a/content/topics/Automation/Workflows/Starting/principles-of-project-setup-and-workflow-management/checklist.md +++ b/content/topics/Automation/Workflows/Starting/principles-of-project-setup-and-workflow-management/checklist.md @@ -26,35 +26,35 @@ Here's a checklist you can use to audit your progress. | | data-preparation | analysis | paper | ... | | ------------------------------------------------------------------------|:--------------:|:-----------:|:-----------:|:-------:| | **At the project level** -| Implement a consistent [directory structure](../directories/#working-example): data/src/gen -| Include [readme with project description](../documenting-code/#main-project-documentation) and technical instruction how to run/build the project -| Store any authentication credentials outside of the repository (e.g., in a JSON file), NOT clear-text in source code -| Mirror your `/data` folder to a secure backup location; alternatively, store all raw data on a secure server and download relevant files to `/data` +| Implement a consistent [directory structure](../directories/#working-example):
data/src/gen +| Include [readme with project description](../documenting-code/#main-project-documentation) and
technical instruction how to run/build the project +| Store any authentication credentials outside of the repository
(e.g., in a JSON file), NOT clear-text in source code +| Mirror your `/data` folder to a secure backup location;
alternatively, store all raw data on a secure server and
download relevant files to `/data` | | **At the level of each stage of your pipeline** | *File/directory structure* -| Create [subdirectory for source code](../directories/#working-example): `/src/[pipeline-stage-name]/` | ☐ | ☐ | ☐ | ☐ | | -| Create [subdirectories for generated files](../directories/#working-example) in `/gen/[pipeline-stage-name]/`: `temp`, `output`, and `audit`. | ☐ | ☐ | ☐ | ☐ | | -| Make all file names relative, and not absolute (i.e., never refer to C:/mydata/myproject, but only use relative paths, e.g., ../output) | ☐ | ☐ | ☐ | ☐ | | -| Create directory structure from within your source code, or use .gitkeep | ☐ | ☐ | ☐ | ☐ | | +| Create [subdirectory for source code](../directories/#working-example):
`/src/[pipeline-stage-name]/` | ☐ | ☐ | ☐ | ☐ | | +| Create [subdirectories for generated files](../directories/#working-example)
in `/gen/[pipeline-stage-name]/`: `temp`, `output`, and `audit`. | ☐ | ☐ | ☐ | ☐ | | +| Make all file names relative, and not absolute
(i.e., never refer to C:/mydata/myproject,
but only use relative paths, e.g., ../output) | ☐ | ☐ | ☐ | ☐ | | +| Create directory structure
from within your source code, or use .gitkeep | ☐ | ☐ | ☐ | ☐ | | | *Automation and Documentation* | Have a [`makefile`](../automation) | ☐ | ☐ | ☐ | ☐ | | | Alternatively, include a [readme with running instructions](../automation/#are-there-alternatives-to-make) | ☐ | ☐ | -| Make dependencies between source code and files-to-be-built explicit, so that `make` automatically recognizes when a rule does not need to be run [(properly define targets and source files)](../automation) | ☐ | ☐ | ☐ | ☐ | | -| Include function to delete temp, output files, and audit files in makefile | ☐ | ☐ | ☐ | ☐ | | +| Make dependencies between source code and
files-to-be-built explicit, so that `make`
automatically recognizes when a rule does
not need to be run
([properly define targets and source files](../automation)) | ☐ | ☐ | ☐ | ☐ | | +| Include function to delete temp, output files,
and audit files in makefile | ☐ | ☐ | ☐ | ☐ | | | *Versioning* -| Version all source code stored in `/src` (i.e., add to Git/GitHub) | ☐ | ☐ | ☐ | ☐ | | -| Do not version any files in `/data` and `/gen` (i.e., do NOT add them to Git/GitHub) | ☐ | ☐ | ☐ | ☐ | | -| Want to exclude additional files (e.g., files that (unintentionally) get written to `/src`? Use .gitignore for files/directories that need not to be versioned | ☐ | ☐ | ☐ | ☐ | | +| Version all source code stored
in `/src` (i.e., add to Git/GitHub) | ☐ | ☐ | ☐ | ☐ | | +| Do not version any files in `/data` and `/gen`
(i.e., do NOT add them to Git/GitHub) | ☐ | ☐ | ☐ | ☐ | | +| Want to exclude additional files (e.g., files that (unintentionally)
get written to `/src`? Use .gitignore for files/directories
that need not to be versioned | ☐ | ☐ | ☐ | ☐ | | | *Housekeeping* | Have short and accessible variable names | ☐ | ☐ | ☐ | ☐ | | | Loop what can be looped | ☐ | ☐ | ☐ | ☐ | | -| Break down "long" source code in subprograms/functions, or split script in multiple smaller scripts | ☐ | ☐ | ☐ | ☐ | | -| Delete what can be deleted (including unnecessary comments, legacy calls to packages/libraries, variables) | ☐ | ☐ | ☐ | ☐ | | -| Use of asserts (i.e., make your program crash if it encounters an error which is not recognized as an error)| ☐ | ☐ | ☐ | ☐ | | +| Break down "long" source code in subprograms/functions,
or split script in multiple smaller scripts | ☐ | ☐ | ☐ | ☐ | | +| Delete what can be deleted (including unnecessary
comments, legacy calls to packages/libraries, variables) | ☐ | ☐ | ☐ | ☐ | | +| Use of asserts (i.e., make your program crash if it
encounters an error which is not recognized as an error)| ☐ | ☐ | ☐ | ☐ | | | *Testing for portability* -| Tested on own computer (entirely wipe `/gen`, re-build the entire project using `make`) | ☐ | ☐ | ☐ | ☐ | | -| Tested on own computer (first clone to new directory, then re-build the entire project using `make`) | ☐ | ☐ | ☐ | ☐ | | +| Tested on own computer (entirely wipe
`/gen`, re-build the entire project using `make`) | ☐ | ☐ | ☐ | ☐ | | +| Tested on own computer (first clone to new
directory, then re-build the entire project using `make`) | ☐ | ☐ | ☐ | ☐ | | | Tested on different computer (Windows) | ☐ | ☐ | ☐ | ☐ | | | Tested on different computer (Mac) | ☐ | ☐ | ☐ | ☐ | | | Tested on different computer (Linux) | ☐ | ☐ | ☐ | ☐ | | diff --git a/content/topics/Automation/Workflows/Starting/principles-of-project-setup-and-workflow-management/documenting-data.md b/content/topics/Automation/Workflows/Starting/principles-of-project-setup-and-workflow-management/documenting-data.md index 99b9b7ec2..e7a7b6f48 100644 --- a/content/topics/Automation/Workflows/Starting/principles-of-project-setup-and-workflow-management/documenting-data.md +++ b/content/topics/Automation/Workflows/Starting/principles-of-project-setup-and-workflow-management/documenting-data.md @@ -34,4 +34,4 @@ names and associated labels (e.g., as in the case of countries --> GDP per capit {{% /tip %}} -Check out [our building block for documenting data](/topics/store-and-document-your-data/document-data/documenting-new-data/)! +Check out [our topic for documenting data](/document/new-data)! diff --git a/content/topics/Automation/automation-tools/Makefiles/airbnb-workflow/automating_workflows.md b/content/topics/Automation/automation-tools/Makefiles/airbnb-workflow/automating_workflows.md index 5561baf40..981efd833 100644 --- a/content/topics/Automation/automation-tools/Makefiles/airbnb-workflow/automating_workflows.md +++ b/content/topics/Automation/automation-tools/Makefiles/airbnb-workflow/automating_workflows.md @@ -16,11 +16,11 @@ Up to this point, you should have created the following five R scripts: | File | Description | Phase | | ---- | ---------- | ------ | -| `download.R` | Downloads the data from Inside Airbnb and stores as csv format | Input | -| `clean.R` | Preprocesses the raw data into an aggregated format ready for analysis and visualisation | Transformation | -| `pivot_table.R` | Create a pivot table for the number of reviews by region across time | Transformation | -| `plot_all.R` | Create a line chart for the total number of reviews in a city across time | Output | -| `plot_Amsterdam.R` | Create a line chart for the number of reviews for the top 3 neighborhoods in Amsterdam | Output | +| `download.R` | Downloads the data from
Inside Airbnb and
stores as csv format | Input | +| `clean.R` | Preprocesses the raw data
into an aggregated format
ready for analysis and
visualisation | Transformation | +| `pivot_table.R` | Create a pivot table for
the number of reviews by
region across time | Transformation | +| `plot_all.R` | Create a line chart for
the total number of reviews
in a city across time | Output | +| `plot_Amsterdam.R` | Create a line chart for
the number of reviews for
the top 3 neighborhoods
in Amsterdam | Output | {{% /summary %}} diff --git a/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/finish.md b/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/finish.md index 49f0235fb..59144d25c 100644 --- a/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/finish.md +++ b/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/finish.md @@ -30,8 +30,7 @@ Oh, and to be complete, here's a summary of what you've learnt. Can you believe you all did this? See below for the (long list) -- You've cloned our GitHub template for a reproducible textmining -workflow ([https://github.com/hannesdatta/textmining-workflow](https://github.com/hannesdatta/textmining-workflow)) +- You've cloned [our GitHub template for a reproducible textmining workflow](https://github.com/hannesdatta/textmining-workflow) - You've verified your software setup (and probably spent a lot of time fixing it!) - You've downloaded our template and ran your first workflow, diff --git a/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/pipeline-automation-overview.md b/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/pipeline-automation-overview.md index 1787ec529..880c1b6cb 100644 --- a/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/pipeline-automation-overview.md +++ b/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/pipeline-automation-overview.md @@ -30,23 +30,23 @@ Longing to put your knowledge from our [workflow guide](/topics/reproducible-res ## Prerequisites -- Computer setup following our [setup instructions](/topics/configure-your-computer/). - - [Python](/topics/configure-your-computer/statistics-and-computation/python/) and the `textblob` package +- Computer setup following our [setup instructions](/topics/computer-setup/software-installation/#software-installation-ezo/). + - [Python](/install/python) and the `textblob` package ``` pip install -U textblob ``` - Then, open Python (`python`) and type + Then, open Python in the terminal by typing `python`, and type ``` import nltk nltk.download('punkt') ``` - If you receive an error message, please verify you are typing this command in python (opened on the terminal by typing `python`), and not *directly* in the terminal/Anaconda prompt. + If you receive an error message, please verify you are typing this command in python, and not *directly* in the terminal/Anaconda prompt. - - [R, RStudio](/topics/configure-your-computer/statistics-and-computation/r/) and the following packages: + - [R, RStudio](/install/r) and the following packages: ``` install.packages(c("data.table", "knitr", "Rcpp", "ggplot2", "rmarkdown")) @@ -67,7 +67,7 @@ Longing to put your knowledge from our [workflow guide](/topics/reproducible-res - If you're being asked to install RTools, please do follow these installation instructions. 
{{% /warning %}} - - [GNU Make](/topics/configure-your-computer/automation-and-workflows/make/) + - [GNU Make](/install/make) - Familiarity with our [workflows](/topics/reproducible-research-and-automation/principles-of-project-setup-and-workflow-management/project-setup-overview/), in particular on [pipelines and project components](/topics/project-management/principles-of-project-setup-and-workflow-management/pipeline/), [directory structure](/topics/project-management/principles-of-project-setup-and-workflow-management/directories/) and [pipeline automation](/topics/project-management/principles-of-project-setup-and-workflow-management/automation/). @@ -76,7 +76,7 @@ Longing to put your knowledge from our [workflow guide](/topics/reproducible-res - Familiarity with common data operations using `data.table` in R - Familiarity with text mining using Python and TextBlob - If you want to learn Git on the way... - - Have Git installed on your computer (see here) + - Have Git installed on your computer (see [here](/install/git)) - Have GitHub login credentials ## Disclaimer diff --git a/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/verify.md b/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/verify.md index ffdd3c0d6..f31fa70d2 100644 --- a/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/verify.md +++ b/content/topics/Automation/automation-tools/Makefiles/practicing-pipeline-automation-make/verify.md @@ -13,7 +13,7 @@ aliases: ## Let's verify whether everything is properly installed -Before we're getting started, let's verify whether you have properly followed our [setup guide](/topics/configure-your-computer/) for R, Python and `make` in order to be able to run our workflow. +Before we're getting started, let's verify whether you have properly followed our [setup guide](topics/computer-setup/software-installation/#software-installation-ezo/) for R, Python and `make` in order to be able to run our workflow. Our tutorial clips are recorded on Windows, but have been tested on a Mac, too. diff --git a/content/topics/Collect-store/data-storage/commercial-cloud/download-data.md b/content/topics/Collect-store/data-storage/commercial-cloud/download-data.md index dd66da075..917ac387b 100644 --- a/content/topics/Collect-store/data-storage/commercial-cloud/download-data.md +++ b/content/topics/Collect-store/data-storage/commercial-cloud/download-data.md @@ -1,10 +1,12 @@ --- title: "Download Data Programmatically" -description: "Learn how to download data right from its (online) source and store it locally with code." +description: "Learn how to download data directly from its online source and save it locally using code. Also, explore how to download and upload data from your local machine to Google Drive" keywords: "download, import data, store, collect, terminal, workflow, programatical download" -#date: 2021-02-08 +#date: 2024-03-09 draft: false weight: 3 +author: "Kheiry Sohooli" +authorlink: "https://tilburgsciencehub.com/contributors/kheirysohooli/" aliases: - /store/data - /store/data-programmatically @@ -91,16 +93,10 @@ data = pandas.read_csv("DOWNLOAD_URL") ``` {{% /codeblock %}} -### Downloading data from Google Drive +### Downloading and uploading data from Google Drive The Google Drive API offers a way to programmatically download and upload files through, for example, a Python script. 
Keep in mind that this only works for files stored in your own Google Drive account (i.e., your own files and those shared with you). - -{{% warning %}} -Unfortunately, the procedure described in the first code snippet does not work for Google Drive sharing links. The steps below may require some set-up time and technical know-how, but they can be re-used for a variety of cloud services. -{{% /warning %}} - - #### Google Cloud Platform Like Amazon and Microsoft, Google offers cloud services (e.g., databases and compute resources) that you can configure through an [online console](https://console.cloud.google.com/home). In Google Cloud Platform you can also enable services like the Google Drive API, which we'll make use of here. Follow the steps below to get started. @@ -112,29 +108,29 @@ Google offers a 90-day trial with a €300 credit to use, but you can keep on us 2. Click on "Create New Project", give it a project name (e.g., `GoogleDriveFiles`), and click on "Create". -![new-project](../images/new_project.png) +![new-project]("../images/new_project.png") 3. Next, we need to set up a so-called OAuth2 client account which is a widely used protocol for authentication and authorization of API services. - * In the left-side bar click on "APIs & Services" > "OAuth consent screen". + * From the navigation menu in the left top click on "APIs & Services" > "OAuth consent screen". * Set the user type to "External" and click on "Create". - * Give your app a name (can be anything) and fill out a support and developer email address. Click "Save and continue" (it may sometimes throw an app error, then just try again!). + * Give your app a name (can be anything) and fill out a support and developer email addresses. Click "Save and continue" (it may sometimes throw an app error, then just try again!). * Click "Save and continue" twice and then "Back to dashboard". * Click on "Publish app" and "Confirm". - * In the left sidebar, click on "Credentials" and then "Create Credentials" > "OAuth client ID" > "Desktop app" and click on "Create". It will show you your client ID and client secret in a pop-up screen. Rather than copying them from here, we will download a JSON file that contains our credentials. Click on "OK" and then on the download symbol: + * In the left sidebar, click on "Credentials" and then "Create Credentials" > "OAuth client ID". Then, from "application type" menu "Desktop app" and click on "Create". It will show you your client ID and client secret in a pop-up screen. Rather than copying them from here, we will download a JSON file that contains our credentials. Click on "OK" and then on the download symbol: ![download-credentials](../images/download_credentials.png) * Rename the file to `client_secret.json` and store it in the same folder as the scripts you'll use to download and upload the files. 4. By default, the Google Drive API is not activated, look for it in search bar and click on "Enable". -5. Download the following [Python script](https://github.com/RoyKlaasseBos/tsh-website/blob/master/content/topics/store-and-document-your-data/store-data/google_drive.py) ("Raw" > "Save as") and put it in the same directory as the client secret. +5. Download google_drive.py from [this link](https://github.com/tilburgsciencehub/website/tree/master/content/topics/Collect-store/data-storage/commercial-cloud). Then click on "Raw" > "Save as" and store it in the same directory as the client secret. 6. 
Run the following command to install the Google Client library: {{% codeblock %}} ```bash -pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib +pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib ``` {{% /codeblock %}} @@ -158,10 +154,10 @@ service = create_service(CLIENT_SECRET_FILE, API_NAME, API_VERSION, SCOPES) ``` {{% /codeblock %}} -The first time a new window may pop up that asks you to authenticate yourself with your Google account. Click on "Allow". +The first time a new window may pop up that asks you to authenticate yourself with your Google account. Select the same account that you used to sign in to the Google Cloud platform. Click on "Allow". ![authenticate](../images/authenticate_Drive.png) -Depending on whether you'd like to download or upload a file, follow one of both approaches: +Depending on whether you'd like to download or upload a file, follow one of two approaches: #### Download a file {{% codeblock %}} @@ -184,21 +180,26 @@ with open("", 'wb') as f: f.close() ``` ```R +# install `googledrive` library if it is not installed already +install.packages("googledrive") + +#load `googledrive` library library(googledrive) -data_id <-"12rVWmiA6r8A1ggyP_T3jM7hONUkXoL3h" -drive_download(as_id(data_id), path = “out_file.csv”, overwrite = TRUE) -df <- read.csv(“out_file.csv”) + +data_id <-"" +drive_download(as_id(data_id), path = “”, overwrite = TRUE) +df <- read.extension(“”) ``` {{% /codeblock %}} -* You can find the ``/`data_id` by navigating towards the file in your browser and clicking on "Open in new window". The URL then contains the file ID you need. For example, the file ID of `https://drive.google.com/file/d/XXXXXX/view` is `XXXXXX`. -![open_new_window](../images/open_new_window.png) +* You can find the ``/`` by navigating towards the file in your browser and clicking on "share", then "Get link". The URL contains the file ID(data_id) you need. For example, the file ID of `https://drive.google.com/file/d/XXXXXX/view` is `XXXXXX`. +![Share](../images/share_link.png) * R may ask *"Is it OK to cache OAuth access credentials in the folder `path/gargle/gargle/Cache between R sessions?`"*. - + - * Type in 1 in the R console to accept. A new window may pop up that asks you to authenticate yourself with your Google account. Click on “Allow”. + * Type in 1 in the R console to accept. A new window will pop up that asks you to authenticate yourself with your Google account. Click on “Allow” and then "continue". @@ -211,7 +212,9 @@ df <- read.csv(“out_file.csv”) {{% codeblock %}} ```python file_metadata = { + ## provide the file name on your local machine "name": "", + ## folder Id on your google drive "parents": [""] } @@ -225,8 +228,19 @@ service.files().create( ``` {{% /codeblock %}} -* The `` can be obtained in a similar way as the `: navigate towards the folder where you'd like to save the file and look for the identifier within the URL. +* You can get the in a similar way to how you obtain the . Navigate towards the folder where you'd like to save the file and look for the identifier within the "Shared link". For example, the folder id of `https://drive.google.com/drive/folders/XXXXXXXXX?usp=share_link` is `XXXXXXXXX`. * The `MediaFileUpload()` function assumes that the file supposed to be uploaded is stored in the current directory. If not, add the subdirectory in which the file is stored to the path. -* `` informs Google Drive about the type of file to be uploaded (e.g., `csv`, `jpg`, `txt`). 
You can find a list of common MIME types over [here](https://learndataanalysis.org/commonly-used-mime-types/). For example, for a csv file it is: `text/csv`. +* `` informs Google Drive about the type of file to be uploaded (e.g., `csv`, `jpg`, `txt`). You can find a list of common MIME types over [here](https://learndataanalysis.org/commonly-used-mime-types/). For example, the MIME type for a CSV file is `text/csv`, and for a PDF file, it is `application/pdf`. + + +{{% summary %}} + +- **Downloading files**: we explored how to download files from URLs and store them locally using both `R` and `Python`, as well as how to download files directly from the terminal. +- **Opening files**: Learn how to access data directly via URL using `R` and `Python` +- **Downloading and uploading data from google drive** + - using google drive API to handle downloads and uploads data. + - Implementing `Python` and `R` scripts to download data from Google Drive and upload it back to Google Drive from a local machine. +{{% /summary %}} + diff --git a/content/topics/Collect-store/data-storage/commercial-cloud/images/R_OAuth_access.PNG b/content/topics/Collect-store/data-storage/commercial-cloud/images/R_OAuth_access.PNG new file mode 100644 index 000000000..3201d9333 Binary files /dev/null and b/content/topics/Collect-store/data-storage/commercial-cloud/images/R_OAuth_access.PNG differ diff --git a/content/topics/Collect-store/data-storage/commercial-cloud/images/share_link.png b/content/topics/Collect-store/data-storage/commercial-cloud/images/share_link.png new file mode 100644 index 000000000..ac08cb56a Binary files /dev/null and b/content/topics/Collect-store/data-storage/commercial-cloud/images/share_link.png differ diff --git a/content/topics/Collect-store/data-storage/commercial-cloud/images/tidyverseAPI_login.PNG b/content/topics/Collect-store/data-storage/commercial-cloud/images/tidyverseAPI_login.PNG new file mode 100644 index 000000000..bd2a1cdfd Binary files /dev/null and b/content/topics/Collect-store/data-storage/commercial-cloud/images/tidyverseAPI_login.PNG differ diff --git a/content/topics/Collect-store/data-storage/databases/_index.md b/content/topics/Collect-store/data-storage/databases/_index.md new file mode 100644 index 000000000..d01f41510 --- /dev/null +++ b/content/topics/Collect-store/data-storage/databases/_index.md @@ -0,0 +1,6 @@ +--- +draft: false +title: "Databases" +weight: 3 +type: subcategory +--- \ No newline at end of file diff --git a/content/topics/Collect-store/data-storage/databases/csv-vs-database.md b/content/topics/Collect-store/data-storage/databases/csv-vs-database.md new file mode 100644 index 000000000..3a2b0518d --- /dev/null +++ b/content/topics/Collect-store/data-storage/databases/csv-vs-database.md @@ -0,0 +1,78 @@ +--- +title: "From CSV to SQL Databases: Matching Data Solutions to Your Needs" +description: "Understand the pros and cons of CSV vs SQL databases, to help finding the most suitable data storage solution tailored to your need" +keywords: "research data management, efficiency, SQL database, CSV, JSON, XML" +#date: 2021-02-08 +draft: false +weight: 1 +author: "Maliheh Mahlouji" +aliases: +--- + +## Overview + +One of the most important decisions at the beginning of a data-intensive project is to choose the right format for data storage. +In this article, we are trying to shed a light on the pros and cons of a simple CSV/JSON/XML vs a SQL database to help you make the best decision +depending on your requirements. 
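As a concrete point of reference for the comparison, the sketch below contrasts the two styles of access: a CSV is read in full before you can filter it, while a SQL database (here SQLite, via Python's built-in `sqlite3` module) returns only the rows you ask for. The file, table, and column names are illustrative.

{{% codeblock %}}
```python
import sqlite3
import pandas as pd

# CSV: the whole file is loaded into memory before filtering
sales = pd.read_csv("sales.csv")
uk_sales = sales[sales["country"] == "UK"]

# SQLite: the database returns only the matching rows
con = sqlite3.connect("sales.db")
con.execute("CREATE INDEX IF NOT EXISTS idx_country ON sales(country)")  # speeds up the lookup
uk_sales_db = pd.read_sql_query("SELECT * FROM sales WHERE country = 'UK'", con)
con.close()
```
{{% /codeblock %}}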
There are several aspects to consider before choosing between simple CSV/JSON/XML formats and a SQL database. Below we inspect the ones most worth considering.

## How big is the data?

If you are dealing with large volumes of data, databases are much more efficient. One reason is that a CSV has to be read in full before you can analyse it.
With a database, you can query only the portion that you need, and if you also apply indexing, the database only searches within those indexes, which is much faster.

In addition, if you expect your data to grow substantially in the future, databases make it possible to scale vertically or horizontally (though the latter is easier in [NoSQL](https://www.coursera.org/articles/nosql-vs-sql) databases).

## Do you need to combine various datasets with complex filters and joins?

If your data is complex enough to contain relationships, for example customer or product data, databases let you build relations between tables with foreign keys. More importantly, you can enforce constraints on those keys to ensure data integrity.

## Will other people access it at the same time?

If you are working on a collaborative project where data is accessed and modified concurrently, a database is your better bet.
This is because database management systems use transactions to modify a database. If you are curious about how transactions help,
head over [here](https://www.geeksforgeeks.org/concurrency-control-in-dbms/?ref=lbp) for more details.

## Is your data structured?

If your data is structured and in a tabular format, a SQL database is a good candidate.
If your data is unstructured, meaning it cannot easily fit in a table, the JSON format offers good flexibility for storing it.
NoSQL databases are also a suitable solution for unstructured data if you want the other advantages that a database can offer.

## How secure does your data need to be?

Unlike CSV and other simple file formats, databases offer built-in user authentication and can protect your data against unauthorised access.
It is fairly simple to grant groups of users specific access privileges in a database. In addition, databases enable encryption of data at rest and in transit.

## Do you need to archive data for a long time?

One perceived barrier to using databases is the cost they incur. However, there are multiple open-source databases,
such as SQLite, that can be hosted locally as well (free of charge).

Even if the data is hosted in a cloud database, it is always possible to archive it using cheaper options.
For example, you can create a backup of an entire database or a specific table (as a *.sql file) and store it exactly where you would store your CSV files.
Furthermore, it is possible to export your SQL tables as CSV (just make sure the commas within cells are handled properly).

{{% warning %}}
Given that CSV stands for Comma-Separated Values, it's important to ensure proper handling of commas within cells when exporting a SQL table to CSV.
Failure to do so could lead to commas being mistaken for column separators.
{{% /warning %}}

{{% tip %}}
Want to explore long-term data archiving solutions? Check out [this page](/topics/collect-store/data-storage/long-term-archiving/choose-a-data-repository/) to learn about the various options.
+{{% /tip %}} + + + +{{% summary %}} +In this article, we've explored several scenarios where utilizing a database proves advantageous. Nevertheless, +it's important to note that CSVs are lightweight file formats widely embraced by individuals with varying levels of technical proficiency. +And sometimes the data requirements are not heavy that justifies that extra mile. +{{% /summary %}} + + + diff --git a/content/topics/Computer-Setup/software-installation/document-creation/latex.md b/content/topics/Computer-Setup/software-installation/document-creation/latex.md index acc932af5..8d2f04d13 100644 --- a/content/topics/Computer-Setup/software-installation/document-creation/latex.md +++ b/content/topics/Computer-Setup/software-installation/document-creation/latex.md @@ -27,7 +27,7 @@ Download the file `install-tl-windows.exe` from **[here](https://www.tug.org/tex You can install MacTeX from the **[official website](https://www.tug.org/mactex/)** or using [`Homebrew`](/configure/cli): ```bash -brew cask install mactex +brew install mactex --cask ``` ### Linux (Ubuntu-based) diff --git a/content/topics/Manage-manipulate/Loading/large-datasets/images/figure1-import-large-dataset-r.png b/content/topics/Manage-manipulate/Loading/large-datasets/images/figure1-import-large-dataset-r.png index b76afa72c..852f12bd9 100644 Binary files a/content/topics/Manage-manipulate/Loading/large-datasets/images/figure1-import-large-dataset-r.png and b/content/topics/Manage-manipulate/Loading/large-datasets/images/figure1-import-large-dataset-r.png differ diff --git a/content/topics/Manage-manipulate/Loading/large-datasets/images/figure2-import-large-dataset-r.png b/content/topics/Manage-manipulate/Loading/large-datasets/images/figure2-import-large-dataset-r.png index 9af035cbc..3f7c54ea9 100644 Binary files a/content/topics/Manage-manipulate/Loading/large-datasets/images/figure2-import-large-dataset-r.png and b/content/topics/Manage-manipulate/Loading/large-datasets/images/figure2-import-large-dataset-r.png differ diff --git a/content/topics/Manage-manipulate/Loading/large-datasets/images/figure3-import-large-dataset-r.png b/content/topics/Manage-manipulate/Loading/large-datasets/images/figure3-import-large-dataset-r.png deleted file mode 100644 index 940cab5eb..000000000 Binary files a/content/topics/Manage-manipulate/Loading/large-datasets/images/figure3-import-large-dataset-r.png and /dev/null differ diff --git a/content/topics/Manage-manipulate/Loading/large-datasets/images/figure3-writing-large-dataset-r.png b/content/topics/Manage-manipulate/Loading/large-datasets/images/figure3-writing-large-dataset-r.png new file mode 100644 index 000000000..3f7c54ea9 Binary files /dev/null and b/content/topics/Manage-manipulate/Loading/large-datasets/images/figure3-writing-large-dataset-r.png differ diff --git a/content/topics/Manage-manipulate/Loading/large-datasets/images/table-1-import-large-dataset-r.png b/content/topics/Manage-manipulate/Loading/large-datasets/images/table-1-import-large-dataset-r.png new file mode 100644 index 000000000..f62115fd0 Binary files /dev/null and b/content/topics/Manage-manipulate/Loading/large-datasets/images/table-1-import-large-dataset-r.png differ diff --git a/content/topics/Manage-manipulate/Loading/large-datasets/images/table2-import-large-dataset-r.png b/content/topics/Manage-manipulate/Loading/large-datasets/images/table2-import-large-dataset-r.png new file mode 100644 index 000000000..f30d50183 Binary files /dev/null and 
b/content/topics/Manage-manipulate/Loading/large-datasets/images/table2-import-large-dataset-r.png differ diff --git a/content/topics/Manage-manipulate/Loading/large-datasets/images/table3-import-large-dataset-r.png b/content/topics/Manage-manipulate/Loading/large-datasets/images/table3-import-large-dataset-r.png new file mode 100644 index 000000000..bf55b0378 Binary files /dev/null and b/content/topics/Manage-manipulate/Loading/large-datasets/images/table3-import-large-dataset-r.png differ diff --git a/content/topics/Manage-manipulate/Loading/large-datasets/large-datasets-R.md b/content/topics/Manage-manipulate/Loading/large-datasets/large-datasets-R.md index 0b8843b44..20942279d 100644 --- a/content/topics/Manage-manipulate/Loading/large-datasets/large-datasets-R.md +++ b/content/topics/Manage-manipulate/Loading/large-datasets/large-datasets-R.md @@ -1,8 +1,8 @@ --- title: "Import Large Datasets Into R" -description: "Importing large datasets in R is made easy by data.table R package. The package makes importing and manipulating huge datasets (like big data) in R faster" +description: "Learn how to efficiently import and manage large datasets in R using packages like `data.table`, `readr`, and `vroom`. These tools offer improvements in speed and memory usage, making it easier to work with big data and improve your data preparation workflow." weight: 2 -keywords: "import, data, data preparation, big data, large datsets, memory, RAM, data.table, big data, R, dataframe, object size" +keywords: "import, data, data preparation, big data, large datsets, memory, RAM, data.table, big data, R, dataframe, object size, readr, vroom" date: 2023-10-26 author: "Matthijs ten Tije" draft: false @@ -14,106 +14,123 @@ aliases: This building block provides you with some practical tips for dealing with __large datasets in R__. -Many R users rely on the base R commands `read.csv` or `read.table` to import their datasets as a dataframe. Although this works well for relatively small datasets, we recommend using the `readr` or `data.table` packages instead - simply because it is significantly faster and offers enhanced functionality. +Many R users rely on the base R commands `read.csv` or `read.table` to import their datasets as a dataframe. Although this works well for relatively small datasets, we recommend using one of the following packages instead: the `readr`, `data.table` or the `vroom` package - simply because it is significantly faster and offers enhanced functionality. You can obtain the data for the examples from this [link](http://download.geonames.org/export/zip/GB_full.csv.zip). -## Advantages of the `readr` and `data.table` packages +## Advantages of Using readr, data.table, or vroom Over Base R +When working with large datasets in R, the default `read.csv` method can be less efficient in terms of speed and memory usage. This is where packages like `readr`, `data.table`, and `vroom` come into play, offering a more optimized approach to data import. -Why should you use the `readr` package or the `data.table` package instead of `read.csv`? +### 1. Significant Speed Gains +Efficiency is important when processing large datasets in R. Traditional base R methods, while reliable, do not offer the speed necessary for modern data analysis tasks. This is where `readr`, `data.table`, and `vroom` shine, each providing solutions to significantly reduce data loading times. -### 1. Significant speed gains - -Both `readr` `read_csv` and `data.table`'s `fread()` andfunctions are significantly faster. 
Take for example, loading in this specific dataset containing 1809903 rows and 12 columns, both functions are 2 or 3 times faster than the base R `read.csv` function. +#### Practical Impact +Benchmarking reveal that all three packages significantly outperform the `base R` function in terms of speed. Specifically, these packages can load the dataset, with 1,809,903 rows and 12 columns, from 3 times up to 11 times faster! {{% codeblock %}} ```R +# Load required libraries library(readr) library(data.table) +library(vroom) -system.time(dt <- read.csv("GB_full.txt")) # Base R -system.time(dt <- read_csv("GB_full.txt")) # readr -system.time(dt <- fread("GB_full.txt")) # data.table -dim(dt) +# Time comparisons for reading a large file +system.time(base_r <- read.delim("GB_full.txt")) +system.time(readr_data <- read_tsv("GB_full.txt")) +system.time(dt_data <- fread("GB_full.txt")) +system.time(vroom_data <- vroom("GB_full.txt", col_names = FALSE)) ``` {{% /codeblock %}} -

*Downloading time using different packages*
-### 2. Quick data wrangling -`Readr` and `Data.table` are coming shipped with efficient data manipulation functionality for large datasets. +### 2. Quick Data Manipulation +The ability to efficiently manipulate data is as crucial as the import phase. All three packages `readr`, `data.table`, and `vroom` are coming shipped with data manipulation features to improve convienence, speed and memory usage. -Readr Package: -- *Selective Loading*: Only required data is loaded, this targeted appraoch speeds up importing. -- *Automatic Typing*: Columns are auto-specified correctly, avoiding post-load type conversions. -- *Progressive Processing*: Data is processed while loading, not after, for faster access. +`Readr` Package: +- *Selective Loading*: `readr` processes the data while loaded, not after, facilitating faster manipulation. +- *Automatic Typing*: Automatically infers column types, reducing the need for post-import adjustments and speeding up the data preparation process. -Data.table Package: -- *In-Place Modification*: Changes data directly without unnecessary copies, enhancing speed. -- *Keys and Indices*: Setting keys speeds up data searching and sorting. +`Data.table` Package: +- *In-Place Modification*: `data.table` allows for in-place data modification, avoiding the slowdowns caused by copying data. +- *Keys and Indices*: Setting keys for data indexing accelerates sorting and subsetting operations. - *Multithreading*: Utilizes all CPU cores for parallel processing of tasks. -- *Minimal Copying*: Reduces memory usage by avoiding redundant data duplication. +- *Minimal Copying*: Reduces memory usage by avoiding data duplication. -The code example below, shows the power of data.table. Which can be 100 times faster! +`Vroom` Package: +- _Initial Load_: Uses a quick scan to map data locations, reducing initial load times. +- _Lazy Loading_: When filtering and aggregating `vroom` only loads in the only necessary data, increasing speed performance and reducing memory footprint. +- _Capability for Large Datasets_: By querying and subsetting of datasets, avoiding full data materialization, it allows for datasets larger than the memory. -{{% codeblock %}} -```R -library(dplyr) +### Practical impact +In the benchmarks, we explore different data manipulation tasks like printing, head/tail operations, random row sampling, specific row filtering, and aggregating for mean calculations. -system.time(dt %>% group_by("Variable 1") %>% - filter("Variable 2" == "England") %>% - summarise(mean("Variable 3"))) #with dplyr +Using the following methodology: +- _vroom_base_: Uses `vroom` for reading, then `base R` functions for manipulation. +- _vroom_dplyr_: Integrates `vroom` for reading with `dplyr` for manipulation. +- _data.table_: Employs `fread` for reading and `data.table` functions for manipulation. +- _readr_dplyr_: Uses `readr` for reading and `dplyr` for manipulation. +- _base_R_: Relies on standard `base R` functions throughout. -system.time(dt["Variable 2" =="England", mean("Variable 3"), by = "Variable 1"]) -``` -{{% /codeblock %}} +The table below represents the running times observed for different data manipulation packages:

*Running time using different packages for data manipulation*
+ +
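To make the "keys and indices" point above concrete, here is a minimal sketch of keyed subsetting with `data.table`, reusing the example file from the speed benchmark (the column name is an assumption — key the column you filter on most often):

{{% codeblock %}}
```R
library(data.table)

dt <- fread("GB_full.txt")   # same example dataset as above
setkey(dt, V2)               # V2 is illustrative: key the column you filter on most often
dt[.("England")]             # keyed subset: binary search instead of a full table scan
```
{{% /codeblock %}}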

-Both `readr` and `data.table` packages are massively quicker when writing files compared to `write.csv()`. +### 3. Increases File Writing Performance +When it comes to writing large datasets to files, `readr`, `data.table`, and `vroom` offer increased speed performance compared to `base R`'s `write.csv()` function. -Just try out the `write_csv` function of `readr`, or the `fwrite()` function in `data.table` package. +Here is our benmark test to compare these functions: {{% codeblock %}} ```R # creating a 1 million by 10 data frame df <- data.frame(matrix(rnorm(1000000*10), 1000000,10)) -system.time({write.csv(my_df, "base.csv", row.names=FALSE) }) # base R -system.time({fwrite(my_df, "datatable.csv") }) # data.table -system.time({write_csv(my_df, "readr.csv") }) # readr +system.time({write.csv(df, "base.csv", row.names=FALSE) }) # base R +system.time({fwrite(df, "datatable.csv") }) # data.table +system.time({write_csv(df, "readr.csv") }) # readr +system.time({vroom_write(df, "vroom.csv")}) # vroom ``` {{% /codeblock %}}

-[figure: Writing files running time using different packages]

+Below is a visual representation of the file writing times across different packages:
+[figure: file writing times across different packages]


## Practical Examples
+This section walks you through practical examples of using the various R packages to import large datasets.

### Importing with Readr

`read_csv()` from the `readr` package offers several advantages over the base R function `read.csv()`:

-- **Integration and Type Detection**: `read_csv()` works well with other tidyverse packages and intelligently determines the data type for each variable.
+- **Integration and Type Detection**: `read_csv()` works well with other `tidyverse` packages and automatically determines the data type for each variable.
- **Tibble Output**: Unlike `read.csv()`, which produces a data.frame, `read_csv()` outputs a tibble. Tibbles offer enhanced functionality and behave more consistently when subsetting.
- **Character Variable Handling**: `read_csv()` treats text variables as character variables.
- In contrast, `read.csv()` automatically converts text to factors, which can be inconvenient.
+  - In contrast, `read.csv()` automatically converts text to factors, which can be inconvenient.

- Overall, `read_csv()` provides a more efficient and user-friendly approach to data handling in R.
+Overall, `read_csv()` provides a more efficient and user-friendly approach to data handling in R.

{{% codeblock %}}
```R
@@ -142,10 +159,9 @@ write_csv(df, "YOUR_CLEANED_DATASET.csv")
{{% /tip %}}

### Importing with data.table
+Switching from the `read.csv()` function to `fread()` can improve the performance of your program. It is often dubbed the "_fast and friendly file finagler_", and is efficient and straightforward to use in R.

-Switching from the `read.csv()` function to `fread()` can greatly improve the performance of your program. It is often dubbed the "fast and friendly file finagler," and is highly efficient and straightforward to use in R.
-
-One of its key features is that it is designed to import data from *regularly* delimited files directly into R. Here, "regular" implies that each row in your dataset must have the same number of columns. What sets fread() apart is its ability to automatically detect parameters like **sep**, **colClasses**, and **nrows**, making the data import process straightforward.
+One of its main features is that it is designed to import data from *regularly* delimited files directly into R. Here, "regular" implies that each row in your dataset must have the same number of columns. What sets `fread()` apart is its ability to automatically detect parameters like **sep**, **colClasses**, and **nrows**, making the data import process straightforward.

The code block below illustrates how you can import your whole dataset using `fread()` as well as subsets of it, determine the size of the resulting object, and store your new/clean versions of the data as a new file for future use.

@@ -174,6 +190,62 @@ fwrite(df, "YOUR_CLEANED_DATASET.csv")

{{% tip %}}

+When starting your analysis of a large dataset, begin by loading a representative sample. This practical strategy facilitates quicker fine-tuning and testing of your data processing and analysis scripts. Once you are confident about the accuracy and robustness of your code, scale up to the entire dataset. This approach saves time upfront and ensures a smoother workflow when handling extensive datasets.
+
+{{% /tip %}}
+
+### Importing with vroom
+
+`vroom` is part of the `tidyverse` and is similar to `readr::read_csv()` or `data.table::fread()`. However, when it comes to raw import speed, `vroom::vroom()` often leads the pack.
In the context of the dataset examined in this article, it processes the data in approximately one-quarter of the time taken by `data.table::fread()`.
+
+The efficiency of `vroom` is attributed to its method of reading in files. Initially, it only marks the location of each data point, deferring the actual reading of the values. Thereafter, it uses the so-called `Altrep` framework, which creates vectors that reference these marked locations. This two-step procedure makes data access fast and memory efficient.
+
+The main reason `vroom` is faster is that character data is read from the file lazily; it does not read the entire dataset unless required. One major benefit is that it requires no adjustments to your data-manipulation code. One disadvantage of `vroom`'s lazy reading method, however, is that it may delay the identification of potential data issues until the point of access.
+
+Observations from the benchmark results show that `vroom`'s initial read is significantly quicker than the alternative methods. Routine operations such as `print()`, `head()`, and random sampling execute as fast as in the other packages. Because the character data is read lazily, operations such as filtering and summarising, which need the character values, require additional time. However, this cost is only incurred once: after the values have been read, they are stored in memory, and subsequent accesses are as quick as with the other packages.
+
+{{% codeblock %}}
+```R
+# import package
+library(vroom)
+
+# get path to example file
+input_file <- vroom_example("YOUR_DATASET.csv")
+input_file
+
+# import data with the vroom package
+# Read from a path
+df <- vroom(input_file)
+# You can also use your path directly
+df <- vroom("YOUR_DATASET.csv")
+
+# only import the first couple of rows for exploratory analysis
+df <- vroom("YOUR_DATASET.csv", n_max = 500)
+
+# only import the data you actually use
+df <- vroom("YOUR_DATASET.csv", col_select = c(1, 2, 5)) # column indices
+df <- vroom("YOUR_DATASET.csv", col_select = c(date, country, revenue)) # column names
+
+# store the derivative file for future use
+vroom_write(df, "YOUR_CLEANED_DATASET.tsv")
+vroom_write(df, "YOUR_CLEANED_DATASET.csv", delim = ",")
+```
+{{% /codeblock %}}
+
+{{% tip %}}
+
Oftentimes, datasets you previously worked with remain stored in memory, even if they're no longer in use. In RStudio, click on the broom icon in the top right window to remove all objects from the environment. By removing objects which are no longer in use you will help improve RStudio's performance and reduce the risk of errors and conflicts.
{{% /tip %}}
+
+{{% summary %}}
+
+This article discusses efficient strategies for handling large datasets in R using the `readr`, `data.table`, and `vroom` packages. Key takeaways are:
+- `readr`, `data.table`, and `vroom` are faster than base R functions for importing data, manipulating data, and writing files.
+- `readr` offers advantages like selective loading and automatic typing, improving the speed of data manipulation.
+- `data.table`'s `fread()` can automatically detect parameters like **sep**, **colClasses**, and **nrows**, making the data import process straightforward.
+- `vroom` accelerates initial loads through quick scans and lazy loading, supporting efficient querying and subsetting of large datasets.
+- Benchmark comparisons demonstrate the substantially improved performance of these packages in both data import and manipulation tasks.
+- Practical examples guide you through using each package for importing, exploring, and manipulating large datasets. + +{{% /summary %}} diff --git a/content/topics/Manage-manipulate/manipulate-clean/textual/images/aft-wordcloud.png b/content/topics/Manage-manipulate/manipulate-clean/textual/images/aft-wordcloud.png new file mode 100644 index 000000000..44efb020f Binary files /dev/null and b/content/topics/Manage-manipulate/manipulate-clean/textual/images/aft-wordcloud.png differ diff --git a/content/topics/Manage-manipulate/manipulate-clean/textual/images/bef-wordcloud.png b/content/topics/Manage-manipulate/manipulate-clean/textual/images/bef-wordcloud.png new file mode 100644 index 000000000..c44f555fe Binary files /dev/null and b/content/topics/Manage-manipulate/manipulate-clean/textual/images/bef-wordcloud.png differ diff --git a/content/topics/Manage-manipulate/manipulate-clean/textual/images/diff-stem-lem.png b/content/topics/Manage-manipulate/manipulate-clean/textual/images/diff-stem-lem.png new file mode 100644 index 000000000..e70c217cb Binary files /dev/null and b/content/topics/Manage-manipulate/manipulate-clean/textual/images/diff-stem-lem.png differ diff --git a/content/topics/Manage-manipulate/manipulate-clean/textual/images/results-text-preprocessing.png b/content/topics/Manage-manipulate/manipulate-clean/textual/images/results-text-preprocessing.png new file mode 100644 index 000000000..3c16343a4 Binary files /dev/null and b/content/topics/Manage-manipulate/manipulate-clean/textual/images/results-text-preprocessing.png differ diff --git a/content/topics/Manage-manipulate/manipulate-clean/textual/images/top-50-terms.png b/content/topics/Manage-manipulate/manipulate-clean/textual/images/top-50-terms.png new file mode 100644 index 000000000..9e38ed164 Binary files /dev/null and b/content/topics/Manage-manipulate/manipulate-clean/textual/images/top-50-terms.png differ diff --git a/content/topics/Manage-manipulate/manipulate-clean/textual/text-prep-python.md b/content/topics/Manage-manipulate/manipulate-clean/textual/text-prep-python.md new file mode 100644 index 000000000..a9fe68aa7 --- /dev/null +++ b/content/topics/Manage-manipulate/manipulate-clean/textual/text-prep-python.md @@ -0,0 +1,362 @@ +--- +title: "Text Pre-processing in Python" +description: "Dive into the world of text preprocessing with Python! Learn how to clean, tokenize, and visualize text data for your NLP projects using popular libraries such as pandas, spaCy, and matplotlib" +weight: 2 +author: "Fernando Iscar" +authorlink: "https://www.linkedin.com/in/fernando-iscar/" +draft: false +date: 2023-03-29 +aliases: + - /text-python +--- + +## Overview + +The ability to process and analyze text data is increasingly important in the era of big data. From social media analytics to customer feedback analysis, the insights gained from text data can inform decision-making and reveal trends. However, raw text data is often messy and unstructured. Preprocessing this data into a clean format is essential for effective analysis. + +This tutorial introduces the fundamental techniques of text preprocessing in Python, utilizing the [pandas](https://pandas.pydata.org/) library for data manipulation, [spaCy](https://spacy.io/) for tokenization and lemmatization, and [matplotlib](https://matplotlib.org/) for data visualization. By the end of this guide, you'll be equipped with some fundamental skills to prepare text data for a Natural Language Processing (NLP) project. + +{{% tip %}} +Are you an R user? 
Then check out our [Text Pre-processing in R](https://tilburgsciencehub.com/topics/manage-manipulate/manipulate-clean/textual/text-preprocessing/) topic! +{{% /tip %}} + +## Introduction to text preprocessing + +When building machine learning models, we typically work with numeric features as models only understand numerical data. However, there are cases where our features are in categorical text format. In such situations, we need to preprocess and encode these features into numerical format, often referred to as vectors, using techniques like Label Encoding or One Hot Encoding. + +The main challenge arises when our entire feature set is in text format, such as product reviews, tweets, or comments. How do we train our machine learning model with text data? As we know, machine learning algorithms only work with numeric inputs. + +### Natural Language Processing + +NLP is a branch of Artificial Intelligence (AI) that enables computers to understand, interpret, manipulate, and respond to human language. In simple terms, NLP allows computers to comprehend human language. + +In order to effectively preprocess text data, it is important to understand key techniques such as tokenization, stemming, and lemmatization. These techniques play a crucial role in breaking down text into smaller units, reducing words to their base form, and producing valid words respectively. + +Python provides several libraries for Natural Language Processing (NLP), including [NLTK](https://www.nltk.org/) and [spaCy](https://spacy.io/). Both are widely used in the NLP community and offer powerful features for text preprocessing and analysis in Python. + +- `NLTK` (Natural Language Toolkit) is a popular library that offers a wide range of tools and resources for tasks such as tokenization, stemming, lemmatization, and more. + +- `spaCy` is a modern and efficient library that provides advanced NLP capabilities, including tokenization, part-of-speech tagging, named entity recognition, and dependency parsing. + +Let's dive deeper into these NLP concepts. + +### Tokenization + +Tokenization is the process of breaking down a text into smaller units called tokens. These tokens can be words, sentences, or even characters, depending on the level of granularity required. Tokenization is an essential step in text preprocessing as it forms the basis for further analysis and manipulation of text data. + +- **Example:** + +{{% codeblock %}} +```python +import spacy + +# Load the spaCy model +nlp = spacy.load("en_core_web_sm") + +# Sample text +text = "Here's an example of tokenization: breaking down text into individual words!" + +# Tokenize the text +doc = nlp(text) + +# Extract tokens +tokens = [token.text for token in doc] + +print(tokens) +``` +{{% /codeblock %}} + +The `output` will be a list of tokens: `["Here", "'s", "an", "example", "of", "tokenization", ":", "breaking", "down", "text", "into", "individual", "words", "!"]`. Notice how punctuation and spaces are treated as separate tokens, which is typical in word tokenization. + +### Stemming + +Stemming is a technique used to reduce words to their base or root form, known as the stem. It involves removing suffixes and prefixes from words to obtain the core meaning. Stemming helps in reducing the dimensionality of text data and can be useful in tasks such as information retrieval and text classification. 
+ +- **Example:** + +{{% codeblock %}} +```python +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize + +# Initialize the stemmer +stemmer = PorterStemmer() + +# Sample text +text = "The boys are playing football. One boy is injured." + +# Tokenize the text +tokens = word_tokenize(text) + +# Stem each token +stemmed_tokens = [stemmer.stem(token) for token in tokens] + +print(stemmed_tokens) + +``` +{{% /codeblock %}} + +The `output` might look like `['The', 'boy', 'are', 'play', 'footbal', '.', 'One', 'boy', 'is', 'injur', '.']`, demonstrating how stemming simplifies words to their roots, albeit not always in a grammatically correct form. + +### Lemmatization + +Lemmatization, unlike stemming, reduces words to their base or dictionary form, known as the lemma. It involves a more sophisticated analysis of a word's morphology to arrive at its simplest form, which ensures that the result is a valid word. + +- **Example:** + +{{% codeblock %}} +```python +import spacy + +# Load the spaCy model +nlp = spacy.load("en_core_web_sm") + +# Sample text +text = "The boys are playing football. One boy is injured." + +# Tokenize and lemmatize the text +doc = nlp(text) + +# Extract lemmatized tokens +lemmatized_tokens = [token.lemma_ for token in doc] + +print(lemmatized_tokens) + +``` +{{% /codeblock %}} + +The `output` will be: `['the', 'boy', 'be', 'play', 'football', '.', 'one', 'boy', 'be', 'injure', '.']`. Here, verbs like "are" and "is" are lemmatized to "be", and "injured" to "injure", ensuring the result is grammatically viable. + +

+[figure: Difference between Stemming and Lemmatization]

+ +## Practical Example + +In this tutorial, we will explore the process of loading, cleaning, and preprocessing text data from a dataset containing subReddit descriptions. SubReddits are specific communities within the larger [Reddit platform](https://edu.gcfglobal.org/en/thenow/what-is-reddit/1/#), focusing on various topics such as machine learning or personal finance. + +### Loading data + +First, we'll use the `pandas` library to load our dataset: + +{{% codeblock %}} +```python +import pandas as pd + +# Example path might need to be changed to your specific file location +file_path = 'subreddit_descriptions.csv' + +# Load the dataset +df = pd.read_csv(file_path) + +# Preview the first few rows of the dataframe +print(df.head()) +``` +{{% /codeblock %}} + +### Cleaning the data + +Before diving into preprocessing, it's crucial to clean our data. This step often involves removing or filling missing values, eliminating duplicate entries, and possibly filtering out irrelevant columns. + +{{% codeblock %}} +```python +# Drop duplicate descriptions +df.drop_duplicates(subset='description', inplace=True) + +# Drop rows with missing descriptions +df.dropna(subset=['description'], inplace=True) + +# Reset the index after the drop operations +df.reset_index(drop=True, inplace=True) + +print("Data after cleaning:") +print(df.head()) +``` +{{% /codeblock %}} + +### Visualizing Data Before Preprocessing + +With the data loaded and cleaned, we can get a visual sense of the text we're working with. This initial look will help identify common terms that could skew our analysis if left unchecked. + +#### Word Cloud + +Creating a word cloud from the raw text data gives us a visual feast of the most prominent words. In this colorful representation, it's easy to spot Reddit-centric language and other non-informative stopwords that we'll want to remove to refine our analysis. + +{{% codeblock %}} +```python +from wordcloud import WordCloud +import matplotlib.pyplot as plt + +# Combine all descriptions into a single string +text_combined = ' '.join(df['description']) + +# Generate and display the word cloud +wordcloud = WordCloud(max_words=1000, background_color='white').generate(text_combined) +plt.figure(figsize=(8, 6)) +plt.imshow(wordcloud, interpolation='bilinear') +plt.axis('off') +plt.show() + +``` +{{% /codeblock %}} + +

+[figure: Word cloud before text preprocessing]

+ +#### Top Terms + +The bar plot provides a clear visualization of the most frequent terms in our dataset. It is evident that terms like "subreddit" and "community" are highly prevalent, along with common stopwords such as *"and"* or *"the"*: + +{{% codeblock %}} +```python +from collections import Counter + +# Split the combined text into words and count them +words = text_combined.split() +word_counts = Counter(words) + +# Get the top 50 most common words +top_50_words = word_counts.most_common(50) +words, frequencies = zip(*top_50_words) + +# Create the bar plot +plt.figure(figsize=(10, 8)) +plt.barh(range(len(top_50_words)), frequencies, tick_label=words) +plt.gca().invert_yaxis() # Invert y-axis to have the highest count at the top +plt.xlabel('Frequency') +plt.title('Top 50 Terms in Descriptions') +plt.show() + +``` +{{% /codeblock %}} + +

+[figure: Top 50 most frequent terms before text pre-processing]

+ +The visuals underscore the necessity of preprocessing: we need to filter out the noise to uncover the true signal in our text data. By removing some Reddit-specific jargon (*subreddit*, *community*, etc.) and stopwords (*and*, *the*, *for*, etc.), tokenizing, and lemmatizing the text, we can focus our analysis on words that carry the most meaning. + +### Text Preprocessing + +The preprocessing function defined below performs the following actions to clean and standardize the text data: + +- Standardizes Capitalization: Converts all text to lowercase to ensure uniformity. +- Removes Noise: Strips out URLs and special characters, retaining only significant textual elements. +- Simplifies Text: Reduces repetition of characters to prevent distortion of word frequency and meaning. +- Processes Text: Utilizes `spaCy` to tokenize and lemmatize the text, filtering out stopwords and punctuation for cleaner tokens. +- Corrects Spelling: Applies a spell-checking process to tokens that appear misspelled due to character repetition. + + +{{% codeblock %}} +```python +import spacy +import re +from spellchecker import SpellChecker + +# Load the spaCy model +nlp = spacy.load('en_core_web_sm') + +# Define the set of custom stop words specific to Reddit +reddit_stop_words = {'subreddit', 'community', 'discussion', 'share', 'welcome'} +for word in reddit_stop_words: + # Add each custom stop word to spaCy's vocabulary so they will be recognized as stop words + nlp.vocab[word].is_stop = True + +# Define the text preprocessing function +def preprocess_text(text): + # Convert to lowercase to normalize the case + text = text.lower() + + # Remove URLs + text = re.sub(r'https?://\S+|www\.\S+', '', text) + + # Remove special characters, keeping only words and basic punctuation + text = re.sub(r'[^a-zA-Z0-9\s,.?!]', '', text) + + # Reduce excessive character repetition to a maximum of two occurrences + text = re.sub(r'(.)\1{2,}', r'\1\1', text) + + # Tokenize and lemmatize the text, removing stop words, punctuation, and short words + doc = nlp(text) + tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and len(token.text) > 2] + + # Correct tokens with repeated characters using a spell checker + spell = SpellChecker() + corrected_tokens = [spell.correction(token) if re.search(r'(.)\1', token) else token for token in tokens] + + # Join the tokens back into a single string, removing any potential None values + return " ".join(token for token in corrected_tokens if token is not None and token != '') + +# Apply the preprocessing function to the 'description' column +df['processed_description'] = df['description'].apply(preprocess_text) + +``` +{{% /codeblock %}} + +With this code, each description in the DataFrame will be processed and stored in a new column `processed_description`, which will contain the cleaned and standardized text ready for further NLP tasks or machine learning modeling. + +### Visualize Data After Preprocessing + +Let's test our preprocessing function with few comment examples before and after applying the preprocessing steps: + +{{% codeblock %}} +```python +# Example usage with sample comments +sample_texts = [ + "Here's an example **Çcomment1: I LOVE TILBURG UNIVERSITY and machine learninggg!! 😊 http://example.com", + "OMG, I love New Yorkkkk City so much!! 🍎🚕", + "This is a great Subreddit community! 
I enjoy reading what you say" +] + +# Print the original and processed texts +for sample_text in sample_texts: + processed_text = preprocess_text(sample_text) + print("Original text:", sample_text) + print("Processed text:", processed_text) + print() + +``` +{{% /codeblock %}} + +

+[figure: Comparison of Reddit-like comments before and after text preprocessing]

+We can see that our function does a good job of cleaning up the sample comments, retaining the most relevant and insightful terms. But let's look at the results on our whole dataset using a word cloud once again:
+
+{{% codeblock %}}
+```python
+# Combine all preprocessed descriptions into a single string
+preprocessed_text_combined = ' '.join(df['processed_description'])
+
+# Generate and display the word cloud for preprocessed text
+wordcloud = WordCloud(max_words=1000, background_color='white').generate(preprocessed_text_combined)
+plt.figure(figsize=(8, 6))
+plt.imshow(wordcloud, interpolation='bilinear')
+plt.axis('off')
+plt.show()
+
+```
+{{% /codeblock %}}

+[figure: Word cloud after text preprocessing]

+ +After applying text preprocessing techniques, we can observe a shift in the most common terms. Instead of generic words, we now see more specific terms related to entertainment (e.g., *game*, *meme*), personal discussions (e.g., *relate*, *people*), and group identity (e.g., *fan*, *sub*), among others. This refined set of words provides valuable insights for various NLP tasks, such as sentiment analysis, topic modeling, and community analysis. + +By cleaning and standardizing the text, we create a foundation for advanced algorithms like [BERT](https://www.techtarget.com/searchenterpriseai/definition/BERT-language-model#:~:text=BERT%2C%20which%20stands%20for%20Bidirectional,calculated%20based%20upon%20their%20connection.), which can understand the context of these terms, or methods like [TF-IDF](https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/), which can highlight the importance of each term in the corpus. + +These techniques enable us to gain meaningful insights and interpretations from the data. If you're interested in learning more about it, stay tuned for our Machine Learning section! + +## Additional Resources + +- [NLP complete guide](https://www.deeplearning.ai/resources/natural-language-processing) +- [SpellChecker Documentation](https://pypi.org/project/pyspellchecker/) +- [Nine Python NLP libraries](https://sunscrapers.com/blog/9-best-python-natural-language-processing-nlp/) \ No newline at end of file diff --git a/content/topics/Research-skills/Scopus/images/advancedsearch.png b/content/topics/Research-skills/Scopus/images/advancedsearch.png new file mode 100644 index 000000000..37845b4ce Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/advancedsearch.png differ diff --git a/content/topics/Research-skills/Scopus/images/analyzeother .png b/content/topics/Research-skills/Scopus/images/analyzeother .png new file mode 100644 index 000000000..220086547 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/analyzeother .png differ diff --git a/content/topics/Research-skills/Scopus/images/analyzeresults.png b/content/topics/Research-skills/Scopus/images/analyzeresults.png new file mode 100644 index 000000000..d574a5f17 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/analyzeresults.png differ diff --git a/content/topics/Research-skills/Scopus/images/coffeesearch.png b/content/topics/Research-skills/Scopus/images/coffeesearch.png new file mode 100644 index 000000000..aea9515b5 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/coffeesearch.png differ diff --git a/content/topics/Research-skills/Scopus/images/freeaccess.png b/content/topics/Research-skills/Scopus/images/freeaccess.png new file mode 100644 index 000000000..bc646bce1 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/freeaccess.png differ diff --git a/content/topics/Research-skills/Scopus/images/landingpage.png b/content/topics/Research-skills/Scopus/images/landingpage.png new file mode 100644 index 000000000..f713db9c9 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/landingpage.png differ diff --git a/content/topics/Research-skills/Scopus/images/modifyquery.png b/content/topics/Research-skills/Scopus/images/modifyquery.png new file mode 100644 index 000000000..a4928245b Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/modifyquery.png differ diff --git a/content/topics/Research-skills/Scopus/images/paper.png 
b/content/topics/Research-skills/Scopus/images/paper.png new file mode 100644 index 000000000..e4462b303 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/paper.png differ diff --git a/content/topics/Research-skills/Scopus/images/personalprofile.png b/content/topics/Research-skills/Scopus/images/personalprofile.png new file mode 100644 index 000000000..64c800a11 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/personalprofile.png differ diff --git a/content/topics/Research-skills/Scopus/images/saveoption.png b/content/topics/Research-skills/Scopus/images/saveoption.png new file mode 100644 index 000000000..31a352e78 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/saveoption.png differ diff --git a/content/topics/Research-skills/Scopus/images/savesearch.png b/content/topics/Research-skills/Scopus/images/savesearch.png new file mode 100644 index 000000000..d1129e9b6 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/savesearch.png differ diff --git a/content/topics/Research-skills/Scopus/images/scopushome .png b/content/topics/Research-skills/Scopus/images/scopushome .png new file mode 100644 index 000000000..bacbee925 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/scopushome .png differ diff --git a/content/topics/Research-skills/Scopus/images/searchresults.png b/content/topics/Research-skills/Scopus/images/searchresults.png new file mode 100644 index 000000000..9b5339429 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/searchresults.png differ diff --git a/content/topics/Research-skills/Scopus/images/searchterms.png b/content/topics/Research-skills/Scopus/images/searchterms.png new file mode 100644 index 000000000..c584fdc41 Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/searchterms.png differ diff --git a/content/topics/Research-skills/Scopus/images/unemployment.png b/content/topics/Research-skills/Scopus/images/unemployment.png new file mode 100644 index 000000000..321868f8d Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/unemployment.png differ diff --git a/content/topics/Research-skills/Scopus/images/uniaccess.png b/content/topics/Research-skills/Scopus/images/uniaccess.png new file mode 100644 index 000000000..e47a2dfec Binary files /dev/null and b/content/topics/Research-skills/Scopus/images/uniaccess.png differ diff --git a/content/topics/Research-skills/Scopus/scopus-literature-review.md b/content/topics/Research-skills/Scopus/scopus-literature-review.md new file mode 100644 index 000000000..5be4242cf --- /dev/null +++ b/content/topics/Research-skills/Scopus/scopus-literature-review.md @@ -0,0 +1,290 @@ +--- +tutorialtitle: "Navigating Scopus for Effective Literature Research" +title: "Navigating Scopus for Effective Literature Research" +description: "Beginner's guide to effectively using Scopus for comprehensive literature reviews, benefiting students writing their thesis" +keywords: "Scopus, literature review, thesis, beginner, papers" +weight: 2 +draft: false +author: "Virginia Mirabile" +aliases: +- /learn/scopus +- /topics/research-skills + +--- + +## Introduction + +In the realm of academia, conducting a comprehensive literature review is a crucial step towards building a strong foundation for your scholarly work, whether it's a paper, thesis, or any other research project. 
As you embark on this journey of knowledge discovery, having the right tools at your disposal is essential, and one such powerful tool is Scopus.
+
+This tutorial is designed for individuals who are new to Scopus, offering a guided tour through its features and functionalities. Whether you're a student navigating the intricacies of academic research or a newcomer to the world of literature reviews, this tutorial aims to simplify the process of utilizing Scopus to its full potential.
+
+By the end of this tutorial, you'll not only be comfortable using Scopus but also equipped with the skills to conduct a thorough and well-informed literature review, laying the groundwork for impactful research.
+
+The structure of this tutorial is as follows:
+
+- Introduction & access
+- Searching for documents
+- Advanced search options
+- Summary
+
+Let's embark on this journey together and unlock what Scopus has to offer!
+
+### What is Scopus?
+
+Scopus is a comprehensive abstract and citation database designed to facilitate academic research. Developed by Elsevier, Scopus covers a vast range of disciplines, including science, technology, medicine, social sciences, and arts and humanities. It provides researchers, scholars, and students with a centralized platform to access a wealth of scholarly literature, including academic journals, conference proceedings, patents, and other scientific publications.
+
+You might be asking yourself: why Scopus? Well, as you will see during the course of this tutorial, Scopus is not only a comprehensive database covering a variety of disciplines, but it also offers personalized features that make your research process more efficient and organized.
+
+### How to access
+
+Using the Scopus database typically requires a subscription or access granted through a research institution, university, or library.
+
+Subscription access to Scopus is only available to organizations and enterprises, so check whether this is the case for your organization.
+
+There is also an option to use Scopus for free: scroll down on the [Elsevier Scopus landing page](https://www.elsevier.com/products/scopus) and find the button that says "View Scopus Preview".

+[screenshot]

+ +### Institutional access + +The most common option to access Scopus is through a university and/or library, either by means of an authentication or just by simply being connected to the Wi-Fi. + +Follow these steps for a smooth start to the Scopus experience: + +1. Make sure you are connected to the university Wi-Fi or the VPN. + +2. Navigate to the [Scopus landing page](https://www.elsevier.com/products/scopus), it should look as follows: + +

+[screenshot]

+ +Click on the button "Sign in". + +3. A box should appear, click on the button "Sign in via your organization": + +

+[screenshot]

+ +4. Type in the box the name of your university, click on the result. + +5. You will be now asked to authenticate your university account. Once verified, you should have full access to the Scopus Database. + + +On the top left corner of the home page you can verify your institutional access. +On the top right corner you can also check your personal account, where you can explore personalized features that we will dive into later in the tutorial. + +Now that you are set up, we can get into exploring what Scopus has to offer. + +## Introduction to the User Interface + +### Personalized features + +By clicking on your account icon on the top right corner, a menu opens with the following personalized items: + +- Saved lists: allows you to rename, edit, delete, add to or export your saved lists of papers or documents. +- Saved searches: allows you to rename, edit, delete, combine or set an alert for saved searches. You can also run a saved search to view the results since the search was last run. +- Alerts: allows you to edit, delete or change the status of your alerts. You can also check for new results based upon the date that the alert was created. Alerts are personalized notifications that you can set up to be updated when new research comes out on a topic of your interest. +- Export preferences: allows you to choose a preferred file type or reference management tool when exporting documents. This can be very useful when exporting references or citations of the literature you plan to use. +In this section you can select the export settings of your preference, which will then be applied to your search sessions. + +## How to search & analyze results + +Let's start exploring the search features of Scopus! + +For the purpose of this tutorial we will use the topic of unemployment, but you are free to choose whatever you feel more comfortable with. + +Enter **unemployment** in the search box as follows: + +

+[screenshot]

+ +- By default, Scopus will search the word(s) in the Article title, Abstract and Keywords of documents. You can specify in which fields to search using the drop-down menu. Some options include: authors, sources, affiliation. For the purpose of this tutorial, we will stick with the default option, but feel free to play around! + +- To expand the search to additional fields, click the "+ Add search field". A new search bar will appear, let's use **long term** for this search. + +Notice that the two searches will be connected by the logical operator AND, meaning that the search aims to include both terms. +Alternatively, you can opt for OR and AND NOT operators, depending on your purpose, but don't worry about this as the topic of logical operators will be covered later in this tutorial. + +- You can also set a specific date range for search results in the following way: + +

+[screenshot]

+ +For more information on setting up a search query in Scopus, you can select ‘Search tips.’ + +Click the search button and let's see what the results are! + +### Refining your search + +Below is the overview of the search results. As you can see there is a large number of documents, they might not all be relevant for your research. To this purpose, let's see how you can refine your search in a few easy steps to get closer to the literature you actually need. + +

+[screenshot]

+ +On the left side of the page, the column "refine search" has a few filters, we will walk through them and use some which aim at our research: + +- Year: if you haven't set this filter previously, there is still the possibility to do so. +- Author name: In case you are searching by author or combinations of authors. +- Subject area: You can restrict the search to your area of interest, in our case we will stick to **social sciences** +- Document type: Choose among different formats, we will choose **article**. +- Source title +- Publication stage +- Keyword: You can select more keywords to further restrict your search, let's try selecting **unemployment, female and adult**. +- Affiliation +- Funding sponsor +- Country +- Source type +- Language: Select your language of interest, it is sometimes overlooked as a way to reduce the number of documents you will have to look through! +- Open access + +Once you are done with your selection, don't forget to click **Limit to**. + +There are a few more adjustments you can perform to the output of the search. Let's have a look at them. + +{{% tip %}} + +By default, the search results are sorted by date. Use the ‘Sort by’ drop-down menu to sort in a different order. One that might be useful if you are writing a thesis is sorting by highest cited, this allows you to immediately point out reliable sources of literature. + +{{% /tip %}} + +Another feature is the possibility to show the abstract by clicking ‘Show abstract’, useful to have a good first impression of a paper instead of opening it. + +### Analyze results + +By clicking the feature ‘Analyze results’ on a search results page provides an analysis of your search and shows you the number of documents in your results broken down (on separate tabs) by year, source, author, affiliation, country, document type, subject area and funding sponsor. +You can click on individual cards to expand and view additional data. + +

+[screenshot]

+ +In the above image you can see the amount of documents published on our topic through time, it depicts a clear increase in research especially in the last decade. +This can be very useful for your research as it points out which years were more prolific for researchers. + +By scrolling down you can inspect additional data. +Authors with the most documents, countries, affiliations and subject areas can be inspected in more detail. + +### Working with a document + +If you find a title or abstract interesting, it is always a good idea to open the page for that document as you can see below: + +

+[screenshot]

+ +Scopus offers the following features and insights: + +- Click an author name to go to the details page for that author. +- You can see the button 'View PDF' if you have direct access to the pdf version of the file; otherwise click on 'Full text options' to check other access types. +- View the three most recent documents to cite this article in the top right corner. +- ‘Metrics’ are article level metrics which allow you to evaluate both citation impact and levels of community engagement around an article. +- By scrolling down you can view the ‘References’ cited in this document. The titles link to the abstract pages for those articles. + +### Saving your search + +Keeping track of the literature you come across and that could be useful for your literature review is extremely important. +Scopus offers the possibility to save an article thanks to the 'Save to list' button easily indicated by a star. + +By clicking that, a box will appear asking you to save the document in either a new or existing list as you can see in the image below: + +

+[screenshot]

+ +The files will then be stored in the personalized area 'Saved lists' in 'My Scopus' which you can access by clicking on your personal profile: + +

+[screenshot]

+ +You can do something similar also with your search query. Navigate back to the search results page: + +

+[screenshot]

+ +Click on save search and choose a name to easily identify your query. It will be stored in your personal area under 'Saved searches'. + +## Advanced search + +The advanced search feature allows you to create a more complex search using field codes, proximity operators or Boolean operators. + +For this part of the tutorial let's change our research topic to how does climate change affect coffee production. + +Type in the search bar **coffee** and then click on 'Advanced document search' as you can see below: + +

+[screenshot]

+ +{{% tip %}} + +To start this part of the tutorial reset the previous search by removing filters and keywords in the search bar. + +{{% /tip %}} + +The keyword 'coffee' was carried over to the advanced search tab. + +

+[screenshot]

+ +On the right side of the page you can explore the different operators, by positioning the cursor on the operator you can get a brief description of its function. By clicking on the '+' a paragraph explaining the function pops up and the operator will also be added to the query. + +Let's start by adding AND to our search, as you type, notice that a drop down menu opens with potential suggestions, follow the operator with 'TITLE-ABS-KEY' and within the parenthesis write **production** so that both will be searched in the title, abstract and keywords of documents. Using the same logic, add also 'climate'. + +This is what our search query should look like: TITLE-ABS-KEY(coffee) AND TITLE-ABS-KEY(production) AND TITLE-ABS-KEY(climate). + +Now that the keywords are defined let's scroll down and explore the filters, which are essentially the same ones as the previous section. + +Let's select: + +- Under document filter, only open access files. +- Subject areas: agricultural and biological sciences (under life sciences) and economics, econometrics and finance (under social sciences). + +Click on 'search', you will be directed to the search results page we are already familiar with. + +If you realize you have limited your results too far you can modify your search, perhaps remove some filters. It is also possible to edit your advanced search string, for example you can remove the 'open access' requirement as your institution might have free access to a lot of resources. + +To view the search string more clearly as you modify it click on 'Outline query', highlight the unwanted items and delete them as you can see below: + +

+[screenshot]

+ +## Summary + +{{% summary %}} + +1. **Accessing Scopus**: Check institutional access or use the free preview option on the Elsevier Scopus landing page. + +2. **Utilizing Personalized Features**: Use 'Saved lists', 'Saved searches', and set up 'Alerts' to organize and track research. + +3. **Basic Search Techniques**: Enter keywords and refine search results using filters. + +4. **Analyzing Search Results**: Use 'Analyze results' to gain insights on research trends and evaluate citation impact. + +5. **Saving Relevant Documents**: Use 'Save to list' to save articles and queries for future reference. + +6. **Advanced Search Techniques**: Utilize Boolean operators and field codes to create complex search queries and refine results. + +{{% /summary %}} + +By following these key steps, you can effectively utilize Scopus for comprehensive literature research, making your academic journey smoother and more productive! + + + + + + + + + + diff --git a/content/topics/Research-skills/Writing/writing-process/preparation.md b/content/topics/Research-skills/Writing/writing-process/preparation.md index dfa0a9a6a..8545161bb 100644 --- a/content/topics/Research-skills/Writing/writing-process/preparation.md +++ b/content/topics/Research-skills/Writing/writing-process/preparation.md @@ -133,6 +133,7 @@ The following resources on the Tilburg Science Hub website and beyond can help y #### Learn LaTeX - [Tutorial on LaTeX](/learn/latex/): LaTex is a typesetting system ideal for academic documents, like your thesis, enabling precise formatting and presentation. +- [LaTeX Thesis Template](/get/latex-templates): Get started with our ready-to-use LaTeX Thesis Template. #### Other skills - [Use ChatGPT for your research](/tutorials/more-tutorials/chat-gpt-research/chat-gpt-research) diff --git a/content/topics/Visualization/data-visualization/graphs-charts/data-storytelling-plotly-R.md b/content/topics/Visualization/data-visualization/graphs-charts/data-storytelling-plotly-R.md index c119e0c63..b83374f95 100644 --- a/content/topics/Visualization/data-visualization/graphs-charts/data-storytelling-plotly-R.md +++ b/content/topics/Visualization/data-visualization/graphs-charts/data-storytelling-plotly-R.md @@ -3,7 +3,7 @@ title: "Dynamic Data Storytelling with Plotly" description: "Exploration and guidance in the art of creating interactive, narrative-driven data visualizations by using Plotly in R" keywords: "data visualization, Plotly, R, interactive charts, dynamic storytelling, ggplot2, data analysis" date: 2023-23-12 -weight: 7 +weight: 8 author: "Matthijs ten Tije" authorlink: "https://tilburgsciencehub.com/contributors/matthijstentije/" aliases: diff --git a/content/topics/Visualization/data-visualization/graphs-charts/grammar-of-graphics-ggplot2.md b/content/topics/Visualization/data-visualization/graphs-charts/grammar-of-graphics-ggplot2.md index ab07b6596..810bb79f9 100644 --- a/content/topics/Visualization/data-visualization/graphs-charts/grammar-of-graphics-ggplot2.md +++ b/content/topics/Visualization/data-visualization/graphs-charts/grammar-of-graphics-ggplot2.md @@ -3,7 +3,7 @@ title: "Grammar of Graphics of ggplot2" description: "Understand the core concepts of zggplot2 - a powerful plotting library for R. Improve your visualizations now." 
keywords: "ggplot2, Grammar of Graphics, Layering, Data Visualization" date: 11-12-23 -weight: 3 +weight: 2 author: "Matthijs ten Tije" authorlink: "https://tilburgsciencehub.com/contributors/matthijstentije/" aliases: diff --git a/content/topics/Visualization/data-visualization/graphs-charts/matplotlib-seaborn.md b/content/topics/Visualization/data-visualization/graphs-charts/matplotlib-seaborn.md index b3674b10d..0899851e8 100644 --- a/content/topics/Visualization/data-visualization/graphs-charts/matplotlib-seaborn.md +++ b/content/topics/Visualization/data-visualization/graphs-charts/matplotlib-seaborn.md @@ -3,7 +3,7 @@ title: "Plotting with Matplotlib and Seaborn in Python" description: "Plotting in Python - comparison between matplotlib and seaborn" keywords: "data, visualization, python, plotting, seaborn, matplotlib" date: 2023-07-19 -weight: 3 +weight: 4 author: "Ana Bianca Luca" authorlink: "https://tilburgsciencehub.com/contributors/anabiancaluca/" aliases: diff --git a/content/topics/Visualization/data-visualization/graphs-charts/plotnine-altair.md b/content/topics/Visualization/data-visualization/graphs-charts/plotnine-altair.md index 181a31c7b..7ec3887fc 100644 --- a/content/topics/Visualization/data-visualization/graphs-charts/plotnine-altair.md +++ b/content/topics/Visualization/data-visualization/graphs-charts/plotnine-altair.md @@ -3,7 +3,7 @@ title: "Visualizing data with Plotnine and Altair" description: "Plotting in Python - comparison between plotnine and altair" keywords: "data, visualization, python, plotting, plotnine, altair" date: 2023-07-25 -weight: 2 +weight: 3 author: "Ana Bianca Luca" authorlink: "https://tilburgsciencehub.com/contributors/anabiancaluca/" aliases: diff --git a/content/topics/Visualization/data-visualization/graphs-charts/saving-plots-r.md b/content/topics/Visualization/data-visualization/graphs-charts/saving-plots-r.md new file mode 100644 index 000000000..b1f36fd63 --- /dev/null +++ b/content/topics/Visualization/data-visualization/graphs-charts/saving-plots-r.md @@ -0,0 +1,260 @@ +--- +title: "Plot Saving in R: Techniques and Best Practices" +description: "Explore techniques for saving R plots using ggsave from the ggplot2 package, including dynamic file naming, version control, and directory management, to improve project organization and file management." +keywords: "R, ggsave, ggplot2, plot saving, data visualization, file management, version control, directory management" +date: 11-03-2024 +weight: +author: "Matthijs ten Tije" +authorlink: "https://tilburgsciencehub.com/contributors/matthijstentije/" +aliases: + - /ggsave + - /ggplot2/ggsave +--- + +## Overview +Saving your visualizations is essential for further analysis, sharing, or completing assignments. While copying figures to the clipboard offers a quick solution, saving figures in formats like PNG, JPG, or PDF is a preferable strategy. Transitioning from temporary to permanent storage methods ensures your work remains accessible and intact over time. + +This article focusses on how to efficiently save visualizations in R using just one function. We'll explore the `ggsave()` function from the `ggplot2` package, which is the best practice tool for saving your figures. The `ggsave()` function introduces easy-to-use and customizable saving capabilities by allowing users to directly specify file names, dimensions, and resolution. 
Beyond the basic syntax for general use, we will also delve into figure file management, including time stamping and organized directory strategies, directly within the function.
+
+## Saving Plots with Base R
+Base `R` provides a simple, device-based approach for saving plots. This method involves three main steps:
+1. Opening a graphics device.
+2. Plotting your data.
+3. Closing the device to finalize the file.
+
+This process ensures visualizations are stored in a desired format, like PNG, JPG, or PDF.
+
+### Step 1: Open a Graphics Device
+To save a plot, you first need to open a graphics device corresponding to your desired file format. `R` offers a variety of functions for this purpose, allowing you to specify file names and dimensions upfront. Use one of the following commands based on your format preference:
+
+{{% codeblock %}}
+
+```R
+# Open a Graphics Device
+pdf("Your-Graph-Name.pdf", width = 8, height = 6)
+png("Your-Graph-Name.png", width = 800, height = 600)
+jpeg("Your-Graph-Name.jpg", width = 800, height = 600)
+```
+{{% /codeblock %}}
+
+### Step 2: Generate and Print Your Plot
+After opening the appropriate device, first create your plot and then print it with `print()`. Printing is necessary to transfer the plot from `R` to the file associated with the graphics device.
+
+{{% codeblock %}}
+
+```R
+# Generate and Print your Plot
+plot(x = mtcars$mpg, y = mtcars$wt, main = "Miles Per Gallon vs. Weight")
+```
+{{% /codeblock %}}
+
+### Step 3: Close the Graphics Device
+Finalize your file by closing the graphics device. This step saves and closes the file, ensuring your plot is stored as intended:
+
+{{% codeblock %}}
+```R
+dev.off()
+```
+{{% /codeblock %}}
+
+### Example Case: ggplot2 Plot
+Here’s an example of how to apply these steps for saving `ggplot2` graphs:
+
+{{% codeblock %}}
+```R
+library(ggplot2)
+library(gapminder)
+
+# Generate plots
+plot1 <- ggplot(gapminder,
+                aes(x = gdpPercap, y = lifeExp)) +
+  geom_point() +
+  labs(title = "Life Expectancy vs. GDP per Capita",
+       x = "GDP per Capita",
+       y = "Life Expectancy")
+
+plot2 <- ggplot(gapminder,
+                aes(x = gdpPercap, y = lifeExp, color = continent)) +
+  geom_point() +
+  labs(title = "Life Expectancy vs. GDP per Capita by Continent",
+       x = "GDP per Capita",
+       y = "Life Expectancy")
+
+# Save plots to PDF, specifying dimensions
+pdf("ggplot_graphs.pdf", width = 8, height = 6)
+print(plot1) # First plot
+print(plot2) # Second plot
+dev.off()
+
+# Save a plot to PNG, adjusting resolution and size
+png("ggplot_graph.png", width = 800, height = 600, res = 150)
+print(plot1) # Print the plot
+
+# Close the Graphics Device
+dev.off()
+```
+
+{{% /codeblock %}}
+
+{{% tip %}}
+_Quick Tip: Why Save Plots as PDFs?_
+
+- **Scalability**: PDFs are vector-based, meaning that you can resize plots without losing clarity.
+- **Quality Preservation**: PDFs maintain sharpness, avoiding the pixelation common in raster formats like PNG or JPG, making them the preferred option for presentations and detailed analysis.
+
+{{% /tip %}}
+
+## Saving Plots with ggsave()
+While base `R` requires three steps to save a plot, the `ggsave()` function from the `ggplot2` package is the best-practice way to save your R plots: it saves a figure with a single function call.
+
+For small projects, or instances where only one or a few visualizations are needed, the basic syntax provided by `ggsave()` is sufficient.
This simplicity allows for the quick saving of plots without the need for extensive customization, making it an ideal choice for straightforward tasks. + +### Syntax and Argument Overview +`ggsave()` automatically picks the file format from the extension of the provided filename. It defaults to saving the last displayed plot, but you have the flexibility to specify which plot to save: + +{{% codeblock %}} + +```R +ggsave(filename, # use .extension such as .png, .pdf, .jpeg + plot = last_plot(), + path = NULL, + width = NA, + height = NA, + units = c("in", "cm", "mm", "px"), + dpi = 300, + ...) +``` +{{% /codeblock %}} + +Important arguments within the function are: +- _filename_: Name and extension of the output file, dictating the format. + - examples: .png, .pdf, etc. +- _plot_: The ggplot or base R object to save, defaulting to the last plot shown. +- _path_: The directory for saving the file, using the current directory if not specified. +- _width, height_: Dimensions of the output file, with an option to specify units. +- _units_: Measurement units for plot dimensions ("in", "cm", "mm", "px"). +- _dpi_: Resolution for raster formats, specified as dots per inch. + +{{% codeblock %}} + +```R +# Generate a ggplot +mtcars_scatterplot <- ggplot(mtcars, + aes(x = wt, y = mpg)) + + geom_point() + + ggtitle("Fuel Efficiency of Cars") + +# Save the plot as a PNG with custom dimensions, resolution, and background color +ggsave("mtcars_fuel_efficiency.png", + plot = mtcars_scatterplot, + width = 10, + height = 6, + dpi = 300, + units = "in", + bg = "white") +``` + +{{% /codeblock %}} + +## Expanding ggsave() Functionality Beyond Basics +The usage of `ggsave()` can be extended to the integration of more sophisticated techniques for file management. For example, by nesting other `R` functions. In this section, we will discuss some practicalities involving naming conventions, dynamic file naming using time stamping, and directory management. The example cases are building on top of each other, to create a structured `ggsave()` approach for your project's visual outputs. + +### File Naming and Organization +Using a structured naming convention as a habit will be helpful in both project organization and ensuring your work is easily accessible in the future. By adhering to clear naming conventions, you make your files both informative and easy to find. + +#### Principles for File Naming: +- _Descriptive Naming_: Clearly articulate the plot content in filenames (e.g., scatterplot_gdp_vs_life_expectancy.pdf) rather than using non-descriptive titles like figure1.pdf. +- _Compatibility_: Choose filenames that are searchable and compatible across different platforms. Avoid spaces, special characters, or uppercase letters. Using underscores (_) or hyphens (-), and sticking to lowercase letters helps maintain consistency across systems. + +In practice, applying these principles in `ggsave()` might look like this: +{{% codeblock %}} +```R +ggsave( + filename = "scatterplot_gdp_vs_life_expectancy.pdf", + plot = "your_plot_object", + width = 8, + height = 6) +``` +{{% /codeblock %}} + + +{{% tip %}} +Adopting `snake_case` for File Naming + +For R projects, particularly when working with `SQL` databases or using the `tidyverse` package, it is recommended to adopt `snake_case` for naming variables, functions, and files (e.g., scatterplot_gdp_vs_life_expectancy). This practice not only ensures readability and database compatibility but also aligns with the naming conventions of the `tidyverse` package. 
More generally, avoid using dots in names to prevent confusion in non-R environments.
+
+{{% /tip %}}
+
+### Time Stamping your output with ggsave()
+We can use `ggsave()` together with a timestamp in the filename to give each iteration of a graph a unique name. This makes it possible to monitor changes and progress as a visualization gets updated over time, which is beneficial whenever an older version of a visualization needs to be referred back to.
+
+#### Adding a Timestamp to the File Name:
+
+`R` offers built-in functionality that returns the current date and time: `Sys.Date()` for adding a date stamp for daily versioning, and `format(Sys.time(), "%Y%m%d_%H%M%S")` for a more granular timestamp that includes the exact time of creation.
+
+Automating the naming process to include the timestamp can streamline your workflow. By using `paste0()` in conjunction with `ggsave()` and the `Sys.Date()` or `Sys.time()` syntax, you can generate output files whose names include the time at which the file was written.
+
+Example of implementing a timestamp in a file name with `ggsave()`:
+
+{{% codeblock %}}
+```R
+# Incorporate a timestamp directly in ggsave()
+ggsave(
+  filename = paste0("scatterplot_gdp_vs_life_expectancy_", format(Sys.Date(), "%Y-%m-%d"), ".pdf"),
+  plot = "your_plot_object",
+  width = 11,
+  height = 8)
+```
+{{% /codeblock %}}
+
+### Directory Structure with ggsave()
+While the visualizations now have clear, version-controlled, and therefore unique names, saving every visualization in one place could still create a cluttered folder. Organizing your plot files into directories maintains a clean and navigable project structure. `ggsave()` facilitates this by allowing you to specify the path where the file should be saved. Moreover, with the `create.dir` argument, it can create new directories directly from the function.
+
+#### Path Specification
+Structuring your directories to mirror the content or stage of your analysis improves your workflow. In `ggsave()` you can specify the `path` to the directory where you want your plot saved, categorizing your files:
+
+For example:
+
+{{% codeblock %}}
+
+```R
+# Saving a plot to a specific directory
+ggsave(
+  filename = paste0("scatterplot_gdp_vs_life_expectancy_", format(Sys.Date(), "%Y-%m-%d"), ".pdf"),
+  plot = "your_plot_object",
+  path = "my_project/analysis/figures/scatterplots",
+  width = 11,
+  height = 8)
+```
+
+{{% /codeblock %}}
+
+#### Automated Directory Handling
+To further improve file management, `ggsave()` can create directories if they don't already exist. This is especially useful for ensuring that your desired file structure is adhered to without requiring manual directory setup. Specify `create.dir = TRUE` within `ggsave()` to use this feature.
+
+{{% codeblock %}}
+```R
+# Automatically creating directories if they don't exist
+ggsave(
+  filename = paste0("scatterplot_gdp_vs_life_expectancy_", format(Sys.Date(), "%Y-%m-%d"), ".pdf"),
+  plot = "your_plot_object",
+  path = "my_project/analysis/figures/scatterplots",
+  width = 11,
+  height = 8,
+  create.dir = TRUE)
+```
+{{% /codeblock %}}
+
+{{% summary %}}
+This article covers techniques for saving R plots with the `ggsave()` function from the `ggplot2` package:
+- Transitioning from base R's device-based plot saving to the more versatile `ggsave()`.
+
+- Using dynamic file naming and conventions within `ggsave()` for clear, searchable plot filenames.
+- Version control with `ggsave()`, using timestamps for unique plot identification.
+- Directory management, specifying paths and auto-creating directories to keep projects organized.
+
+The article includes practical examples at each step, ready-to-use code snippets, and best-practice tips, aimed at improving your project's organization and efficiency in managing visual outputs.
+
+{{% /summary %}}
diff --git a/content/topics/Visualization/data-visualization/graphs-charts/stata-graphs.md b/content/topics/Visualization/data-visualization/graphs-charts/stata-graphs.md
index c2c247127..d82e82248 100644
--- a/content/topics/Visualization/data-visualization/graphs-charts/stata-graphs.md
+++ b/content/topics/Visualization/data-visualization/graphs-charts/stata-graphs.md
@@ -2,7 +2,7 @@
 title: "Stata Graphs Made Easy"
 description: "Learn how to quickly and efficiently prepare graphs in Stata."
 keywords: "stata, graphs, data visualization, graph, data, twoway, command, dataset, example, tutorial"
-weight: 4
+weight: 5
 date: 2022-06-20T22:02:51+05:30
 draft: false
 aliases:
diff --git a/content/topics/Visualization/data-visualization/graphs-charts/statistical-testing-inside-ggplot.md b/content/topics/Visualization/data-visualization/graphs-charts/statistical-testing-inside-ggplot.md
index 55581d87c..8ab83c53c 100644
--- a/content/topics/Visualization/data-visualization/graphs-charts/statistical-testing-inside-ggplot.md
+++ b/content/topics/Visualization/data-visualization/graphs-charts/statistical-testing-inside-ggplot.md
@@ -3,7 +3,7 @@ title: "Combining Statistical Testing and Visualization, directly in R using ggp
 description: "Researchers often report the outcome of analyses (e.g., a statistical test) in visualizations (e.g., a figure in a paper). We show you how to automate this using `ggpubr`'s visualization tools and `rstatix`'s features in R."
 keywords: "ggplot2, bar chart, `ggpubr`, categorical variable, dplyr, `rstatix`, data visualization"
 date: 11-12-2023
-weight: 5
+weight: 8
 author: "Matthijs ten Tije"
 authorlink: "https://tilburgsciencehub.com/contributors/matthijstentije/"
 aliases:
diff --git a/content/topics/Visualization/data-visualization/graphs-charts/styling-bar-charts-in-ggplot2.md b/content/topics/Visualization/data-visualization/graphs-charts/styling-bar-charts-in-ggplot2.md
index 95cf1d72f..29407bb6c 100644
--- a/content/topics/Visualization/data-visualization/graphs-charts/styling-bar-charts-in-ggplot2.md
+++ b/content/topics/Visualization/data-visualization/graphs-charts/styling-bar-charts-in-ggplot2.md
@@ -2,7 +2,7 @@
 title: "Styling Bar Charts in ggplot2"
 description: "Effective data visualization balances accuracy and aesthetics. The R package `ggplot2`, while versatile, can be challenging for custom styling. This guide covers color, theme, and label customization, with styles for various data scenarios, especially academic papers. It includes using standard errors/error bars, black-and-white formatting, data grouping, and saving visuals as high-quality PNG or PDF for publication."
 date: 11-12-2023
-weight: 4
+weight: 7
 author: "Matthijs ten Tije"
 authorlink: "https://tilburgsciencehub.com/contributors/matthijstentije/"
 aliases:
@@ -258,37 +258,13 @@ In this code:
 Remember, the positions and labels are adjustable based on your specific data and results.
 Experiment with the x and y values in the annotate function to achieve the best placement in your bar chart. This approach provides a clear, customized way to denote significant findings in your visualization.
 
-## Saving Your Plots
-`ggsave()` is an essential function in R, primarily used in conjunction with the ggplot2 package. The key role of ggsave() is to facilitate the saving of these ggplot2-generated plots into various file formats such as JPEG, PNG, PDF, and SVG, making it a versatile tool in data visualization.
-
-The main purpose of ggsave is to provide a straightforward method for saving ggplot2-generated plots.
-
-The function follows the following syntax:
-
-{{% codeblock %}}
-```R
-ggsave(filename, plot = last_plot(), device = NULL, path = NULL, scale = 1, width = NA, height = NA, dpi = 300, limitsize = TRUE, ...).
-```
-{{% /codeblock %}}
-
-**Key Parameters:**
-- filename: Specifies the desired name for the saved file.
-- plot: Indicates the ggplot object to be saved. If omitted, the function saves the last displayed plot.
-- device: Determines the output file format (e.g., PNG, PDF).
-- width, height, dpi: These parameters control the dimensions and resolution of the saved plot, allowing for customization of the output size and quality.
+{{% tip %}}
 
-### Practical example
+_Saving Your Plots_
 
-{{% codeblock %}}
-```R
-plotExample <- ggplot(mpg, aes(displ, hwy)) + geom_point()
-plotExample # This command displays the plot.
-Using ggsave to Save the Plot:
-ggsave("my_plot.png", plotExample, width = 10, height = 8, dpi = 300)
-```
-{{% /codeblock %}}
+Transferring plots via copy-paste can degrade their quality, but our article about saving plots in R provides a solution. It discusses how to use `ggsave()` from the `ggplot2` package to save your visuals in high-quality formats. [Discover](/ggplot2/ggsave) how to maintain clarity in presentations and publications by customizing plot dimensions and resolutions.
 
-This command saves the plotExample as a PNG file named "my_plot.png", with specified dimensions of 10 inches in width and 8 inches in height, and a resolution of 300 dpi.
+{{% /tip %}}
 
 ## Advanced Techniques for Multi-Group Bar Charts in `ggplot2`
 
@@ -367,8 +343,8 @@ This article uses `ggplot2` for effective bar chart styling, crucial in academic
 - Bar charts in `ggplot2` are ideal for categorical data, showcasing groups and their quantitative measures.
 - Key functions covered include `ggplot()` for initial plot creation and `geom_col()` for constructing bar charts.
 - Advanced customization is achieved using `geom_errorbar()` for error bars and `scale_fill_manual()` for color themes.
-- Showcases how to add p-values inside your ggplot.
-- Uses `ggsave()`, demonstrating how to save the final plots in publication-ready formats like PNG or PDF.
+- Showcases how to add p-values inside your `ggplot`.
+
 Interested in the source code used in this analysis? Download it [here](source-code-barchart-visualization.R).
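+
+Complementing the tip on saving plots above, the snippet below is a minimal sketch of how you might export a finished chart with `ggsave()`. The object name `styled_bar_chart` is only a placeholder for whatever ggplot object you built in this topic, and the dimensions are illustrative rather than prescriptive; see the linked ggsave topic for the full set of options.
+
+{{% codeblock %}}
+```R
+library(ggplot2)
+
+# `styled_bar_chart` is a placeholder for the ggplot object created above
+# Vector output (PDF) scales cleanly for print and publication
+ggsave("styled_bar_chart.pdf",
+       plot = styled_bar_chart,
+       width = 8, height = 6)
+
+# Raster output (PNG) at 300 dpi works well for slides and the web
+ggsave("styled_bar_chart.png",
+       plot = styled_bar_chart,
+       width = 8, height = 6, dpi = 300)
+```
+{{% /codeblock %}}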
diff --git a/content/topics/Visualization/data-visualization/graphs-charts/theory-best-practices.md b/content/topics/Visualization/data-visualization/graphs-charts/theory-best-practices.md index b6f5c25cd..bbe1c5cc7 100644 --- a/content/topics/Visualization/data-visualization/graphs-charts/theory-best-practices.md +++ b/content/topics/Visualization/data-visualization/graphs-charts/theory-best-practices.md @@ -362,6 +362,11 @@ ggsave("my_box_plot.png", plot = box_plot, width = 8, height = 6, dpi = 600) ``` {{% /codeblock %}} +{{% tip %}} + +For a more in-depth exploration of the syntax and capabilities of `ggsave()`, check out this [article](/ggplot2/ggsave). + +{{% /tip %}} ## Summary Data visualization is essential for understanding data. It uses marks (like points and lines) and channels (such as color and size) to create charts. A complete chart includes axes, legends, titles, and labels for clarity. diff --git a/content/topics/Visualization/data-visualization/graphs-charts/time-series-ggplot2.md b/content/topics/Visualization/data-visualization/graphs-charts/time-series-ggplot2.md index a2f72203a..9e565848d 100644 --- a/content/topics/Visualization/data-visualization/graphs-charts/time-series-ggplot2.md +++ b/content/topics/Visualization/data-visualization/graphs-charts/time-series-ggplot2.md @@ -3,7 +3,7 @@ title: "Visualizing Time Series Data with ggplot2" description: "Explore the use of ggplot2 in visualizing time series data, from basic plotting techniques to advanced customization, using the tidyquant package for financial data analysis." keywords: "ggplot2, time series visualization, R, tidyquant, data visualization, scale formatting, grouping, faceting" date: 2024-02-14 -weight: +weight: 6 author: "Matthijs ten Tije" authorlink: "https://tilburgsciencehub.com/contributors/matthijstentije/" aliases: @@ -411,27 +411,3 @@ This article demonstrate how to use `ggplot2` for time series data visualization [R-link](time-series-ggplot2.Rmd) {{% /codeblock %}} - -{{% tip %}} - -#### Tips for Saving Plots Efficiently in R - -- **Path Verification**: Confirm the save path exists to avoid errors. -- **Streamlined Saving**: For quick saves, use the current working directory by omitting the path, only specifying the file name. Check your directory with getwd(). -- **Format Specification**: Although ggsave() guesses the format from the file extension, explicitly define the file type with the type parameter for precision. -- **Dimension Control**: Adjust width, height, and dpi for optimal clarity, especially critical for high-stakes presentations or publications. Units default to inches but can be set to centimeters. -- **Automated Naming**: Employ dynamic naming for your files, like incorporating timestamps, to enhance file management and prevent overwrites, fostering a more organized workflow. 
- -{{% /tip %}} - -{{% codeblock %}} -```R -# Example: Saving a plot to the current working directory with specified dimensions and DPI -ggsave(filename = "Your-Figure-Title.png", width = 10, height = 6, dpi = 300) - -# Example: Saving multiple plots using automatic file naming -timestamp <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S") -filename <- paste0("plot_", timestamp, ".png") -ggsave(filename, width = 8, height = 5, dpi = 300) -``` -{{% /codeblock %}} diff --git a/static/contributors/roshinisudhaharan.webp b/static/contributors/roshinisudhaharan.webp new file mode 100644 index 000000000..d92149b7a Binary files /dev/null and b/static/contributors/roshinisudhaharan.webp differ diff --git a/themes/tilburg/layouts/contributors/single.html b/themes/tilburg/layouts/contributors/single.html index 30e5043f5..cc37d0d5e 100644 --- a/themes/tilburg/layouts/contributors/single.html +++ b/themes/tilburg/layouts/contributors/single.html @@ -29,23 +29,37 @@

{{ .Params.name }}

{{ $contributions := slice }} - {{ range where .Site.Pages "Section" "topics" }} + {{ range where .Site.Pages "Section" "topics" }} {{ if ne .Title "topics" }} {{ if isset .Params "author" }} - {{ if eq .Params.Author $.Params.name }} - {{ $contributions = $contributions | append . }} + {{ $names := split .Params.Author ", " }} + {{ $match := false }} + {{ range $names }} + {{ if eq . $.Params.name }} + {{ $match = true }} + {{ end }} + {{ end }} + {{ if $match }} + {{ $contributions = $contributions | append . }} + {{ end }} {{ end }} {{ end }} {{ end }} - {{ end }} - - {{ range where .Site.Pages "Section" "examples" }} + + {{ range where .Site.Pages "Section" "examples" }} {{ if ne .Title "examples" }} {{ if isset .Params "author" }} - {{ if eq .Params.Author $.Params.name }} - {{ $contributions = $contributions | append . }} + {{ $names := split .Params.Author ", " }} + {{ $match := false }} + {{ range $names }} + {{ if eq . $.Params.name }} + {{ $match = true }} + {{ end }} {{ end }} + {{ if $match }} + {{ $contributions = $contributions | append . }} + {{ end }} {{ end }} {{ end }} {{ end }}