# initial_analysis.R
setwd("~/AstroNet Projects/Proiecte Personale/Junior Data Engineer Assignment")
# Read all three CSV files into data frames; treat empty fields as NA so the
# missing-value filter at the end of the script counts them correctly.
df_fb <- read.csv("facebook_dataset_modified.csv", na.strings = c("", "NA"))
df_google <- read.csv("google_dataset_modified.csv", na.strings = c("", "NA"))
# For the website data frame, the separator is a semicolon
df_website <- read.csv("website_dataset.csv", sep = ";",
                       na.strings = c("", "NA"))
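# Quick sanity check on the loads (a diagnostic sketch, not part of the
# pipeline): if the separator were wrong, df_website would collapse into a
# single column, so the dimensions of each frame reveal parse problems early.
dim(df_fb)
dim(df_google)
dim(df_website)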
# PREPROCESSING ----
# For each data frame, identify all the column names
colnames(df_fb)
colnames(df_google)
colnames(df_website)
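# To back up the column choice below, we can inspect the overlap between any
# two frames before renaming, e.g. the columns the Facebook and Google data
# already share (a quick sketch using base R):
intersect(colnames(df_fb), colnames(df_google))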
# The columns to keep are: "domain", "name", "categories", "phone",
# "country_name", "city_name", "region_name", "address".
# Each of these appears in at least two of the three datasets and carries
# the most relevant information.
# To standardize the data, the columns must be renamed consistently so that
# the three data frames can be stacked without mismatches.
# For df_fb:
# Rename "city" to "city_name"
colnames(df_fb)[colnames(df_fb) == "city"] <- "city_name"
# For df_google:
# Rename "category" to "categories"
colnames(df_google)[colnames(df_google) == "category"] <- "categories"
# Rename "city" to "city_name"
colnames(df_google)[colnames(df_google) == "city"] <- "city_name"
# For df_website:
# Rename "root_domain" to "domain"
colnames(df_website)[colnames(df_website) == "root_domain"] <- "domain"
# Rename "legal_name" to "name"
colnames(df_website)[colnames(df_website) == "legal_name"] <- "name"
# Rename "category" to "categories"
colnames(df_website)[colnames(df_website) == "s_category"] <- "categories"
# Rename "main_city" to "city_name"
colnames(df_website)[colnames(df_website) == "main_city"] <- "city_name"
# Rename "main_country" to "country_name"
colnames(df_website)[colnames(df_website) == "main_country"] <- "country_name"
# Rename "main_region" to "region_name"
colnames(df_website)[colnames(df_website) == "main_region"] <- "region_name"
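# The one-by-one renames above can equivalently be driven by a named lookup
# map, which scales better if more columns ever need standardizing. A base R
# sketch (running it after the renames above is a harmless no-op, since the
# old names are already gone):
rename_map <- c(root_domain = "domain", legal_name = "name",
                s_category = "categories", main_city = "city_name",
                main_country = "country_name", main_region = "region_name")
hits <- colnames(df_website) %in% names(rename_map)
colnames(df_website)[hits] <- rename_map[colnames(df_website)[hits]]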
# Add the "address" column to the website data frame
df_website$address <- NA
# The address column will be NA for the rows coming from the website data,
# but this is an acceptable trade-off for a larger final dataset.
# ASSEMBLING THE FINAL DATASET ----
# The biggest dataset is google, so the resulting data frame will be based on
# the google data frame.
# We extract the columns that are common to all three data frames
df <- df_google[,c("domain", "name", "categories", "phone", "country_name",
"city_name", "region_name", "address")]
# Append the corresponding columns from the facebook and website data frames;
# duplicate rows are removed afterwards.
df <- rbind(df, df_fb[,c("domain", "name", "categories", "phone",
"country_name", "city_name",
"region_name", "address")])
df <- rbind(df, df_website[,c("domain", "name", "categories", "phone",
"country_name", "city_name", "region_name",
"address")])
# We remove potential duplicates
df <- unique(df)
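# Before filtering, it helps to look at the distribution of missing values
# per row, so the threshold below is an informed choice (a diagnostic sketch):
table(rowSums(is.na(df)))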
# If a row has more than 4 missing values, we remove it for lack of information.
df <- df[rowSums(is.na(df)) <= 4,]
# Output the final data frame to a csv file
write.csv(df, "final_dataset.csv", row.names = FALSE)