column_concat.py
# For the CSV files google_dataset.csv and facebook_dataset.csv, the number of columns
# per row can vary.
# For the Google dataset, if a row has more columns than the header, we concatenate the
# 14th column into the 13th column until the total number of columns equals the number
# of columns in the header. This way we recover the information from the text column
# that was split across multiple columns, which is better than dropping the row altogether.
# For the Facebook dataset, if a row has at least 9 columns, we concatenate the 8th column
# into the 7th column until the 9th column starts with "http". This keeps the description
# intact, using the URL as a delimiter.
# To prevent errors, we discard rows that do not meet these conditions, and we check that
# the number of columns at the end of the algorithm is less than or equal to the number
# of columns in the header.
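#
# For example (values made up for illustration): a Google row whose free-text field
# contains an unquoted comma, such as
#     ..., Great coffee, friendly staff
# reaches csv.reader as two fields, "Great coffee" and " friendly staff", so the row
# arrives one column wider than the header; the repair below joins the pieces back
# into a single 13th column.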
import csv

import pandas as pd  # used only by the optional sanity check at the end of the file

def modify_csv(file_path, output_path):
    # Open the input and output CSV files.
    with open(file_path, mode='r', newline='', encoding='utf-8') as infile, \
         open(output_path, mode='w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        # Read the header row and record its width.
        header = next(reader)
        num_header_columns = len(header)
        # Write the header to the output file.
        writer.writerow(header)
        # Process each data row.
        for row in reader:
            # If the row has more columns than the header, the 13th (text) column
            # was split, so we need to concatenate.
            if len(row) > num_header_columns:
                # Join the 13th column and every column after it back into the
                # 13th column, then trim the row to the header width.
                row[12] = ' '.join(row[12:])
                row = row[:num_header_columns]
            # Write the (possibly repaired) row to the output file.
            writer.writerow(row)
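
# A minimal smoke test for modify_csv (a hypothetical addition; it is never called in
# the original pipeline below). It assumes a 13-column header whose last column is free
# text, which is what the merge above implies for the Google dataset: it writes a tiny
# CSV whose second data row overflows the header, runs the repair, and prints the
# resulting column counts.
def _smoke_test_modify_csv():
    header = [f'col{i}' for i in range(1, 14)]          # 13 columns; col13 is the text field
    intact = [f'v{i}' for i in range(1, 14)]            # already matches the header width
    split = intact[:12] + ['great', 'service', 'fast']  # text field split into 3 pieces
    with open('smoke_input.csv', mode='w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows([header, intact, split])
    modify_csv('smoke_input.csv', 'smoke_output.csv')
    with open('smoke_output.csv', mode='r', newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            print(len(row), row[12])                    # every row should print 13 columns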
def modify_csv_condition(file_path, output_path):
    # Open the input and output CSV files.
    with open(file_path, mode='r', newline='', encoding='utf-8') as infile, \
         open(output_path, mode='w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        # Read the header row and record its width.
        header = next(reader)
        num_header_columns = len(header)
        # Write the header to the output file.
        writer.writerow(header)
        # Process each data row.
        for row in reader:
            # Rows with fewer than 9 columns cannot be repaired; skip them.
            if len(row) < 9:
                continue
            # Keep concatenating the 8th column into the 7th until the 9th column
            # starts with "http" and the row fits within the header.
            while len(row) > 8:
                if row[8].startswith("http") and len(row) <= num_header_columns:
                    break
                # Concatenate the 8th column onto the 7th column.
                row[6] = f"{row[6]} {row[7]}"
                # Remove the now-merged 8th column.
                row.pop(7)
                # If the row has dropped below 9 columns, discard it.
                if len(row) < 9:
                    row = None
                    break
            # The row was discarded inside the loop; move on to the next one.
            if row is None:
                continue
            # Safety net: if the 9th column still does not start with "http",
            # discard the row.
            if not row[8].startswith("http"):
                continue
            # Write the repaired row to the output file.
            writer.writerow(row)
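
# A hypothetical trace of the loop in modify_csv_condition (0-based indices, as
# csv.reader produces them; a header width of 9 is assumed purely for illustration):
#   [a, b, c, d, e, f, "desc part 1", "part 2", "http://example.com"]            (9 cols)
#     -> written unchanged: row[8] already starts with "http" and fits the header.
#   [a, b, c, d, e, f, "desc part 1", "part 2", "part 3", "http://example.com"]  (10 cols)
#     -> pass 1: row[6] = "desc part 1 part 2"; row.pop(7) shifts the URL to row[8]
#     -> pass 2: row[8] starts with "http" and len(row) == 9, so the loop breaks
#        and the repaired 9-column row is written out.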
modify_csv('google_dataset.csv', 'google_dataset_modified.csv')
modify_csv_condition('facebook_dataset.csv', 'facebook_dataset_modified.csv')
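
# Optional sanity check (an addition, not part of the original pipeline): re-read each
# repaired file with pandas and confirm it now parses with a single, consistent column
# count. A ParserError here would mean some rows still overflow the header.
for path in ('google_dataset_modified.csv', 'facebook_dataset_modified.csv'):
    try:
        df = pd.read_csv(path)
        print(f'{path}: {df.shape[0]} rows x {df.shape[1]} columns')
    except pd.errors.ParserError as exc:
        print(f'{path}: still malformed -> {exc}')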