Credit: Dung Lai
You can view his course here: https://dunglailaptrinh.com/L-p-H-c-Data-Science-C-B-n-Python-c735d90b891a4351b658fff8d8cab589
- dfds
- ádasd
- dsasd
- Used FileIO to read data.txt
- Used basic techniques to extract clean data
- Visualized the following insights:
- Top first names of candidates
- Top last names of candidates
- Average scores of 11 age groups
- Average scores of candidates who did not sit for some subjects
- Number of candidates who did not sit for some subjects
import csv
# Subject columns, in output order. Names are lower-case because the scores
# text is lower-cased before it is searched.
SUBJECTS = ["toán", "ngữ văn", "khxh", "khtn", "lịch sử", "địa lí", "gdcd", "sinh học", "vật lí", "hóa học", "tiếng anh"]
HEADER = ["sbd", "tên", "dd", "mm", "yy"] + SUBJECTS

# The candidate number is incremented before each record, so the first
# record in the raw file gets FIRST_SBD + 1.
FIRST_SBD = 2000000

# NOTE(review): the original script skipped records with `if sbd in ...`, but
# the collection on the right-hand side was truncated in the source. Restore
# the intended candidate numbers here. Until then, records that do not contain
# enough lines are skipped structurally by clean_record().
SKIPPED_SBDS = frozenset()

# Positions of the interesting lines once tags and blank lines are removed.
NAME_LINE = 7
DOB_LINE = 8
SCORES_LINE = 9

# Value written for a subject that does not appear in the scores line.
MISSING_SCORE = "-1"


def _strip_tags(line):
    """Return *line* with every ``<...>`` span removed."""
    tags = []
    begin = None
    for pos, ch in enumerate(line):
        if ch == "<":
            begin = pos
        elif ch == ">" and begin is not None:
            # Guarding on `begin` avoids slicing from an unset start when a
            # stray ">" shows up before any "<".
            tags.append(line[begin:pos + 1])
    for tag in tags:
        line = line.replace(tag, "")
    return line


def _decode_numeric_entities(text):
    """Replace decimal character references (``&#NNN;``) with their characters.

    References of any digit count are handled; malformed ones are left as-is.
    """
    head, *tail = text.split("&#")
    pieces = [head]
    for chunk in tail:
        digits, semicolon, rest = chunk.partition(";")
        if semicolon and digits.isdecimal() and int(digits) <= 0x10FFFF:
            pieces.append(chr(int(digits)) + rest)
        else:
            pieces.append("&#" + chunk)
    return "".join(pieces)


def _score_after(scores, subject):
    """Return the token following *subject* in *scores*, or MISSING_SCORE.

    Searching the text directly (rather than splitting on a fixed separator)
    lets multi-word subjects such as "ngữ văn" match regardless of how many
    spaces separate names from scores.
    """
    start = scores.find(subject)
    if start == -1:
        return MISSING_SCORE
    following = scores[start + len(subject):].split()
    return following[0] if following else MISSING_SCORE


def load_entity_table(path="unicode.txt"):
    """Read the replacement table: one ``<char> <code>`` pair per line.

    Returns a list of ``(code, char)`` tuples in file order. Lines with fewer
    than two fields (e.g. a trailing blank line) are ignored.
    """
    table = []
    with open(path, "r", encoding="utf8") as table_file:
        for line in table_file.read().split("\n"):
            fields = line.split()
            if len(fields) >= 2:
                table.append((fields[1], fields[0]))
    return table


def clean_record(raw, sbd, entity_table=()):
    """Turn one raw record into a CSV row.

    Args:
        raw: One line of the raw file. Line breaks, carriage returns and tabs
            inside it are spelled as the literal two-character sequences
            ``\\n``, ``\\r`` and ``\\t``.
        sbd: Integer candidate number for this record.
        entity_table: Iterable of ``(code, char)`` pairs; every ``code`` found
            in the name or scores text is replaced by ``char``.

    Returns:
        A list of strings matching HEADER, or ``None`` when the record does
        not contain enough non-empty lines to hold name, birth date and scores.

    Raises:
        ValueError: If the birth-date line is not three ``/``-separated
            integers.
    """
    lines = []
    for line in raw.split("\\n"):
        line = line.replace("\\r", "").replace("\\t", "")
        line = _strip_tags(line).strip()
        if line:
            lines.append(line)
    if len(lines) <= SCORES_LINE:
        return None

    name = lines[NAME_LINE]
    dob = lines[DOB_LINE]
    scores = lines[SCORES_LINE]

    for code, char in entity_table:
        name = name.replace(code, char)
        scores = scores.replace(code, char)
    name = _decode_numeric_entities(name).lower()
    scores = _decode_numeric_entities(scores).lower().replace(":", "")

    # int() normalises zero-padded values such as "01" to "1".
    day, month, year = (int(part) for part in dob.split("/")[:3])

    row = ["0" + str(sbd), name.title(), str(day), str(month), str(year)]
    row.extend(_score_after(scores, subject) for subject in SUBJECTS)
    return row


def main(raw_path="raw_data.txt", table_path="unicode.txt", out_path="clean_data.csv"):
    """Convert every record of *raw_path* into a row of *out_path*.

    The replacement table is loaded once and the output file is opened once;
    each usable record is written exactly once.
    """
    entity_table = load_entity_table(table_path)
    with open(raw_path, "r") as raw_file:
        records = raw_file.read().split("\n")

    with open(out_path, "w", encoding="utf8", newline="") as file_csv:
        writer = csv.writer(file_csv)
        writer.writerow(HEADER)
        sbd = FIRST_SBD
        for record in records:
            # Numbering advances for every raw line, including skipped ones,
            # so candidate numbers stay aligned with line positions.
            sbd += 1
            if sbd in SKIPPED_SBDS:
                continue
            row = clean_record(record, sbd, entity_table)
            if row is not None:
                writer.writerow(row)


if __name__ == "__main__":
    main()