-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathprepare-documents.py
39 lines (25 loc) · 1.3 KB
/
prepare-documents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
import numpy as np
import os
from util.meta import cache_dir, input_dir
# Text types accepted by fix_date: ``str`` plus whatever type a ``u''``
# literal has (``unicode`` on Python 2, ``str`` again on Python 3), so the
# bare name ``unicode`` is never looked up at runtime.
_TEXT_TYPES = (str, type(u''))


def fix_date(d):
    """Repair a date string whose first two characters are '30' or '00'.

    If ``d`` is a text value starting with ``'30'`` or ``'00'``, those two
    leading characters are replaced by ``'20'`` (for example ``'3015-01-01'``
    becomes ``'2015-01-01'``).  Every other value -- strings with a different
    prefix, NaN, None, numbers -- is returned unchanged.

    Args:
        d: A raw cell value; may be a string or a non-string placeholder.

    Returns:
        The corrected string, or ``d`` itself when no correction applies.
    """
    if isinstance(d, _TEXT_TYPES) and d.startswith(('30', '00')):
        # Only the leading prefix is rewritten; later '30'/'00' runs stay.
        return '20' + d[2:]
    return d
def encode_feature(values):
    """Replace every distinct value of a Series with a 1-based integer code.

    Codes are handed out in the order ``values.unique()`` yields the distinct
    values: the first one gets 1, the second 2, and so on.

    Args:
        values: A pandas Series (any object exposing ``unique`` and ``map``).

    Returns:
        The result of ``values.map`` applied with the value-to-code dict.
    """
    codes = {}
    for code, item in enumerate(values.unique(), 1):
        codes[item] = code
    return values.map(codes)
# Load the raw document metadata, indexed by document_id (read as uint32).
meta_path = os.path.join(input_dir, 'documents_meta.csv.zip')
df = pd.read_csv(meta_path, index_col='document_id', dtype={'document_id': np.uint32})

# Missing ids are filled with -1 before casting each column to a signed int.
for id_column, id_dtype in (('source_id', np.int32), ('publisher_id', np.int16)):
    df[id_column] = df[id_column].fillna(-1).astype(id_dtype)

# Repair bad year prefixes via fix_date, turn literal 'nan' strings into real
# missing values, then parse; entries that cannot be parsed are coerced.
raw_publish_time = df['publish_time'].map(fix_date).replace('nan', np.nan)
df['publish_time'] = pd.to_datetime(raw_publish_time, errors='coerce')

# Integer form of the parsed times, floor-divided by 1,000,000 and shifted by
# a fixed offset, then floored at -1e12.
# NOTE(review): presumably nanoseconds -> milliseconds, with 1465876799998
# being the reference instant used by the rest of the dataset, and the clip
# bounding the value produced for missing dates -- confirm.
publish_int = df['publish_time'].astype(np.int64)
shifted = publish_int // 1000000 - 1465876799998
df['publish_timestamp'] = shifted.clip(lower=-1000000000000)

df.to_csv(os.path.join(cache_dir, 'documents.csv.gz'), compression='gzip')
## Document entities
# Load the entity rows, indexed by document_id (read as uint32).
entities_path = os.path.join(input_dir, 'documents_entities.csv.zip')
df = pd.read_csv(entities_path, index_col='document_id', dtype={'document_id': np.uint32})

# Swap each entity_id for the integer code assigned by encode_feature.
encoded_entities = encode_feature(df['entity_id'])
df['entity_id'] = encoded_entities

df.to_csv(os.path.join(cache_dir, 'documents_entities.csv.gz'), compression='gzip')
# Parenthesised so the line parses on both Python 2 and Python 3; with a
# single argument the emitted text is the same on either interpreter.
print("Done.")