-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape_all.py
110 lines (79 loc) · 3.51 KB
/
scrape_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/python3.7
'''
Run this file to gather all scraping results for
- indeed.com
- monster.com
Usage:
python3 scrape_all.py <"job search term"> <"location">
(quotes are required)
Example:
python3 scrape_all.py --searchterm "computer science" --location 'san jose'
Where:
<location> can be a string as a city and/or state (San jose, CA) or an int as a zip code (95050)
'''
import os
import subprocess
import argparse
import random
import csv
# Get command line arguments
parser = argparse.ArgumentParser(description='Scrape from all data sources and update their .csv files.\n')
parser.add_argument("--searchterm", default=None, type=str, required=True, help="The job search term, such as Biology or Computer Science. Place in quotes")
parser.add_argument("--location", default=None, type=str, required=True, help="Specify the job search's location. It can be a string as a city and/or state (San jose, CA) or a zip code (95050). Place in quotes")
# Command line arguments are placed into variables and these will act as the job search term
# These will be concatenated into the scraper command line arguments to scrape data for that search
args = parser.parse_args()
search_term = args.searchterm
location = args.location
# Initialize path for scrapers
# Get current working directory
cwd = os.path.dirname(os.path.realpath(__file__))
indeed_path = (cwd + '/scrapers/indeed_com/indeed.py')
monster_path = (cwd + '/scrapers/monster_com/monster.py')
indeed_csv_path = (cwd + '/front-end/src/csv/indeed_jobs.csv')
monster_csv_path = (cwd + '/front-end/src/csv/monster_jobs.csv')
allJobs_csv_path = (cwd + '/front-end/src/csv/allJobs.csv') # Where both .csv files will be combined
# Gain elevated user permissions to call files
subprocess.Popen('chmod +x ' + indeed_path, shell=True)
subprocess.Popen('chmod +x ' + monster_path, shell=True)
print('Scraping all data now...\n')
# Add quotes to the string
quotes = '"'
# Run shell prompt to run the python files
os.system('{} {} {} {} {} {} {} {} {} {} {}'.format('python3', indeed_path, ' --searchterm ', " ", quotes, search_term, quotes, ' --location ', quotes, location, quotes))
print('\n')
os.system('{} {} {} {} {} {} {} {} {} {} {}'.format('python3', monster_path, ' --searchterm ', " ", quotes, search_term, quotes, ' --location ', quotes, location, quotes))
print('\n')
# Combine both jobs.csv files
print('Combining both jobs.csv files now...\n')
reader = csv.reader(open(indeed_csv_path))
reader1 = csv.reader(open(monster_csv_path))
f = open(allJobs_csv_path, "w")
writer = csv.writer(f)
for row in reader:
writer.writerow(row)
for row in reader1:
writer.writerow(row)
f.close()
# Randomize both CSV files' lines
print('Randomizing lines of jobs.csv now...\n')
with open(allJobs_csv_path,'r') as source:
data = [ (random.random(), line) for line in source ]
data.sort()
with open(allJobs_csv_path,'w') as target:
for _, line in data:
target.write( line )
# Re-write header to randomize
header = "Title,Company,Location,Link,JobSource\n"
with open(allJobs_csv_path) as f1, open(indeed_csv_path) as f2, open(monster_csv_path) as f3:
lines_f1 = f1.readlines() #read
lines_f2 = f2.readlines() #read
lines_f3 = f3.readlines() #read
lines_f1[0] = header # Write to line 1
lines_f2[0] = header # Write to line 1
lines_f3[0] = header # Write to line 1
with open(allJobs_csv_path, "w") as f1, open(indeed_csv_path, "w") as f2, open(monster_csv_path, "w") as f3:
# Write back
f1.writelines(lines_f1)
f2.writelines(lines_f2)
f3.writelines(lines_f3)