This repository has been archived by the owner on Jun 25, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathiso2csv.py
executable file
·116 lines (86 loc) · 2.54 KB
/
iso2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import csv
import re
import urllib2
from urllib2 import urlopen
from bs4 import BeautifulSoup
# Harvest script: read record URLs from juneau.csv, download each XML record,
# and write one CSV row of selected fields per record to juneaue-output.csv.
# NOTE(review): the element names below look like ISO 19115/19139 metadata
# (matching the script name) -- confirm against the harvested records.

# Elements whose first occurrence in a record fills one scalar CSV column.
# The order matches the first eleven header columns written below.
_SCALAR_TAGS = [
    'fileIdentifier',
    'title',
    'abstract',
    'credit',
    'date',
    'MD_TopicCategoryCode',
    'westBoundLongitude',
    'eastBoundLongitude',
    'northBoundLatitude',
    'southBoundLatitude',
    'MD_Format',
]


def _field_text(field):
    """Return the stripped, UTF-8 encoded text of *field*, or "undefined".

    ``field`` is the result of ``soup.find(...)``: a tag, or ``None`` when
    the record contains no such element.
    """
    if field is None:
        return "undefined"
    # This script targets Python 2 (it imports urllib2), where csv.writer
    # cannot write non-ASCII unicode objects, so every value is encoded
    # before it reaches the writer.
    return field.text.strip().encode('utf-8')


# The output file is opened (and the header written) before the input is
# read, as in the original script; the ``with`` block guarantees it is
# flushed and closed even if a download or parse fails part-way through.
with open('juneaue-output.csv', 'w') as output:
    f = csv.writer(output)
    f.writerow(['Identifier','Title','Description','Originator','Date_Issued','Topic','West','East','North','South','Format','links','keywords'])

    # Each input row's first column is the URL of one metadata record.
    # Blank lines come back from csv.reader as empty lists and are skipped,
    # since indexing them with [0] would raise IndexError.
    with open('juneau.csv','r') as harvest:
        portalMetadata = [row for row in csv.reader(harvest) if row]

    for url in portalMetadata:
        page = urlopen(url[0]).read()
        soup = BeautifulSoup(page, "xml")

        # soup.find returns the first matching element or None.
        row = [_field_text(soup.find(tag)) for tag in _SCALAR_TAGS]

        # First child node of every URL element; empty elements have no
        # children and are skipped rather than raising IndexError.
        links = [node.contents[0] for node in soup.select('URL') if node.contents]

        # NOTE(review): links and keywords are handed to the writer as a
        # list and a bs4 result set, so their cells hold the string form of
        # those containers (keywords includes the element markup). Left
        # unchanged to keep the existing output format; confirm with the
        # consumers of this CSV before flattening them to plain text.
        keywords = soup.find_all('keyword')

        row.append(links)
        row.append(keywords)
        f.writerow(row)