-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_data_merge.py
51 lines (38 loc) · 1.64 KB
/
1_data_merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# -*- coding: utf-8 -*-
"""
Created on Sat May 18 17:13:49 2019
@author: Alex
"""
# Import packages
import pandas as pd
import numpy as np
import os
import glob
# NOTE: machine-specific working directory; every relative path used after
# this line is resolved against it
os.chdir('D:/Users/Alex/Git_Repositories/Thesis/SIPP_Data')
# Create list of filenames; sorted so the concatenation order does not depend
# on the order in which the filesystem happens to list the files
datafiles = sorted(glob.glob('*.asc'))
if not datafiles:
    # Fail with an explicit message instead of the generic
    # "No objects to concatenate" error pd.concat raises on empty input
    raise FileNotFoundError('No *.asc files found in ' + os.getcwd())
# Read and concatenate datafiles, treating the value '-1' as missing
SIPP_addendum = pd.concat([pd.read_csv(f, na_values='-1') for f in datafiles])
# Drop extra column
SIPP_addendum.drop(columns='RPYPER1', inplace=True)
# Rename the identifier columns to the lower-case names that are used as
# merge keys later in the script
SIPP_addendum.rename(
    columns={
        'SSUID': 'ssuid',
        'SPANEL': 'spanel',
        'SWAVE': 'swave',
        'SREFMON': 'srefmon',
        'EPPPNUM': 'epppnum',
    },
    inplace=True,
)
# Read base data
SIPP_base = pd.read_stata('Byker_Files/SIPP_Paid_Leave.dta')
# Cast the key columns of the base data to int64 in a single step
# (presumably so they compare equal to the SIPP_addendum keys in the merge
# later in the script -- confirm against the dtypes read_csv infers)
merge_keys = ['ssuid', 'spanel', 'swave', 'srefmon', 'epppnum']
SIPP_base[merge_keys] = SIPP_base[merge_keys].astype('int64')
# Show the first rows of the addendum key columns; a bare .head() expression
# is silently discarded when the file is run as a script, so print it
print(SIPP_addendum.loc[:, merge_keys].head())
# Inner-join the base data with the addendum on the five shared key columns
# (only rows present in both frames are kept); validate='1:1' makes pandas
# raise if a key combination is duplicated in either frame
df = SIPP_base.merge(
    SIPP_addendum,
    how='inner',
    on=['ssuid', 'epppnum', 'spanel', 'swave', 'srefmon'],
    validate='1:1',
)
# Write the merged dataframe to a pickle file in the working directory
df.to_pickle('SIPP_Dataset')