-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataExtraction.py
54 lines (44 loc) · 1.69 KB
/
dataExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os

import pandas as pd

# Directory that holds kaggle.json.
# This must be exported BEFORE `import kaggle`: the kaggle package resolves its
# configuration directory and authenticates as a side effect of being imported,
# so assigning KAGGLE_CONFIG_DIR after the import would not change where the
# credentials are looked up.
os.environ['KAGGLE_CONFIG_DIR'] = 'C:/Users/tsop/.kaggle'  # Update this path to your Kaggle configuration directory

import kaggle  # noqa: E402 - deliberately imported after KAGGLE_CONFIG_DIR is set

# Kaggle dataset identifier in "<owner>/<dataset-slug>" form
dataset = 'piterfm/paris-2024-olympic-summer-games'

# Local directory the dataset is downloaded and unzipped into
download_path = 'D:/Programming/pydir/tutorials/olympics_dashboard/data'  # Change this to your preferred download directory
# Make sure the download directory exists. On a first run it may not have been
# created yet, and os.listdir() below would raise FileNotFoundError.
os.makedirs(download_path, exist_ok=True)

# Remove existing files in the folder to prevent duplicates or outdated files.
# Only regular files directly inside the folder are deleted; sub-directories
# are left untouched.
for file in os.listdir(download_path):
    file_path = os.path.join(download_path, file)
    if not os.path.isfile(file_path):
        continue
    try:
        os.unlink(file_path)  # Delete the file
    except OSError as e:
        # Best effort: report the failure (e.g. file locked or permission
        # denied) and carry on with the remaining files.
        print(f"Error deleting {file_path}: {e}")
    else:
        print(f"Deleted {file_path}")

# Download the dataset using the Kaggle API and unzip the files
kaggle.api.dataset_download_files(dataset, path=download_path, unzip=True)
# CSV files from the downloaded dataset that should be loaded
csv_files = [
    'athletes.csv',
    'events.csv',
    'medallists.csv',
    'medals.csv',
    'medals_total.csv',
    'schedules.csv',
    'schedules_preliminary.csv',
    'teams.csv',
    'torch_route.csv',
    'venues.csv',
]

# Map each table name (the file name up to its first '.') to a DataFrame
# read from the matching CSV file inside download_path.
dataframes = {
    csv_name.partition('.')[0]: pd.read_csv(os.path.join(download_path, csv_name))
    for csv_name in csv_files
}