-
Notifications
You must be signed in to change notification settings - Fork 1
/
process-new.py
executable file
·125 lines (112 loc) · 12.2 KB
/
process-new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
from lxml import etree
import os, glob, re, datetime
def lenTest(path,tree):
return len(tree.xpath(path))
def cleanerApp(content):
# Because there are various issues coming through from the unicode stuff, so let's start cleaning this up.
rdict = {'\xc3\x80':'\u00c0', '\xc3\x81':'\u00c1', '\xc3\x82':'\u00c2', '\xc3\x83':'\u00c3', '\xc3\x84':'\u00c4', '\xc3\x85':'\u00c5', '\xc3\x86':'\u00c6', '\xc3\x87':'\u00c7', '\xc3\x88':'\u00c8', '\xc3\x89':'\u00c9', '\xc3\x8a':'\u00ca', '\xc3\x8b':'\u00cb', '\xc3\x8c':'\u00cc', '\xc3\x8d':'\u00cd', '\xc3\x8e':'\u00ce', '\xc3\x8f':'\u00cf', '\xc3\x90':'\u00d0', '\xc3\x91':'\u00d1', '\xc3\x92':'\u00d2', '\xc3\x93':'\u00d3', '\xc3\x94':'\u00d4', '\xc3\x95':'\u00d5', '\xc3\x96':'\u00d6', '\xc3\x97':'\u00d7', '\xc3\x98':'\u00d8', '\xc3\x99':'\u00d9', '\xc3\x9a':'\u00da', '\xc3\x9b':'\u00db', '\xc3\x9c':'\u00dc', '\xc3\x9d':'\u00dd', '\xc3\x9e':'\u00de', '\xc3\x9f':'\u00df', '\xc3\xa0':'\u00e0', '\xc3\xa1':'\u00e1', '\xc3\xa2':'\u00e2', '\xc3\xa3':'\u00e3', '\xc3\xa4':'\u00e4', '\xc3\xa5':'\u00e5', '\xc3\xa6':'\u00e6', '\xc3\xa7':'\u00e7', '\xc3\xa8':'\u00e8', '\xc3\xa9':'\u00e9', '\xc3\xaa':'\u00ea', '\xc3\xab':'\u00eb', '\xc3\xac':'\u00ec', '\xc3\xad':'\u00ed', '\xc3\xae':'\u00ee', '\xc3\xaf':'\u00ef', '\xc3\xb0':'\u00f0', '\xc3\xb1':'\u00f1', '\xc3\xb2':'\u00f2', '\xc3\xb3':'\u00f3', '\xc3\xb4':'\u00f4', '\xc3\xb5':'\u00f5', '\xc3\xb6':'\u00f6', '\xc3\xb7':'\u00f7', '\xc3\xb8':'\u00f8', '\xc3\xb9':'\u00f9', '\xc3\xba':'\u00fa', '\xc3\xbb':'\u00fb', '\xc3\xbc':'\u00fc', '\xc3\xbd':'\u00fd', '\xc3\xbe':'\u00fe', '\xc3\xbf':'\u00ff', '\xc4\x80':'\u0100', '\xc4\x81':'\u0101', '\xc4\x82':'\u0102', '\xc4\x83':'\u0103', '\xc4\x84':'\u0104', '\xc4\x85':'\u0105', '\xc4\x86':'\u0106', '\xc4\x87':'\u0107', '\xc4\x88':'\u0108', '\xc4\x89':'\u0109', '\xc4\x8a':'\u010a', '\xc4\x8b':'\u010b', '\xc4\x8c':'\u010c', '\xc4\x8d':'\u010d', '\xc4\x8e':'\u010e', '\xc4\x8f':'\u010f', '\xc4\x90':'\u0110', '\xc4\x91':'\u0111', '\xc4\x92':'\u0112', '\xc4\x93':'\u0113', '\xc4\x94':'\u0114', '\xc4\x95':'\u0115', '\xc4\x96':'\u0116', '\xc4\x97':'\u0117', '\xc4\x98':'\u0118', '\xc4\x99':'\u0119', '\xc4\x9a':'\u011a', '\xc4\x9b':'\u011b', '\xc4\x9c':'\u011c', '\xc4\x9d':'\u011d', '\xc4\x9e':'\u011e', '\xc4\x9f':'\u011f', '\xc4\xa0':'\u0120', '\xc4\xa1':'\u0121', '\xc4\xa2':'\u0122', '\xc4\xa3':'\u0123', '\xc4\xa4':'\u0124', '\xc4\xa5':'\u0125', '\xc4\xa6':'\u0126', '\xc4\xa7':'\u0127', '\xc4\xa8':'\u0128', '\xc4\xa9':'\u0129', '\xc4\xaa':'\u012a', '\xc4\xab':'\u012b', '\xc4\xac':'\u012c', '\xc4\xad':'\u012d', '\xc4\xae':'\u012e', '\xc4\xaf':'\u012f', '\xc4\xb0':'\u0130', '\xc4\xb1':'\u0131', '\xc4\xb2':'\u0132', '\xc4\xb3':'\u0133', '\xc4\xb4':'\u0134', '\xc4\xb5':'\u0135', '\xc4\xb6':'\u0136', '\xc4\xb7':'\u0137', '\xc4\xb8':'\u0138', '\xc4\xb9':'\u0139', '\xc4\xba':'\u013a', '\xc4\xbb':'\u013b', '\xc4\xbc':'\u013c', '\xc4\xbd':'\u013d', '\xc4\xbe':'\u013e', '\xc4\xbf':'\u013f', '\xc5\x80':'\u0140', '\xc5\x81':'\u0141', '\xc5\x82':'\u0142', '\xc5\x83':'\u0143', '\xc5\x84':'\u0144', '\xc5\x85':'\u0145', '\xc5\x86':'\u0146', '\xc5\x87':'\u0147', '\xc5\x88':'\u0148', '\xc5\x89':'\u0149', '\xc5\x8a':'\u014a', '\xc5\x8b':'\u014b', '\xc5\x8c':'\u014c', '\xc5\x8d':'\u014d', '\xc5\x8e':'\u014e', '\xc5\x8f':'\u014f', '\xc5\x90':'\u0150', '\xc5\x91':'\u0151', '\xc5\x92':'\u0152', '\xc5\x93':'\u0153', '\xc5\x94':'\u0154', '\xc5\x95':'\u0155', '\xc5\x96':'\u0156', '\xc5\x97':'\u0157', '\xc5\x98':'\u0158', '\xc5\x99':'\u0159', '\xc5\x9a':'\u015a', '\xc5\x9b':'\u015b', '\xc5\x9c':'\u015c', '\xc5\x9d':'\u015d', '\xc5\x9e':'\u015e', '\xc5\x9f':'\u015f', '\xc5\xa0':'\u0160', '\xc5\xa1':'\u0161', '\xc5\xa2':'\u0162', '\xc5\xa3':'\u0163', '\xc5\xa4':'\u0164', '\xc5\xa5':'\u0165', '\xc5\xa6':'\u0166', '\xc5\xa7':'\u0167', '\xc5\xa8':'\u0168', '\xc5\xa9':'\u0169', '\xc5\xaa':'\u016a', '\xc5\xab':'\u016b', '\xc5\xac':'\u016c', '\xc5\xad':'\u016d', '\xc5\xae':'\u016e', '\xc5\xaf':'\u016f', '\xc5\xb0':'\u0170', '\xc5\xb1':'\u0171', '\xc5\xb2':'\u0172', '\xc5\xb3':'\u0173', '\xc5\xb4':'\u0174', '\xc5\xb5':'\u0175', '\xc5\xb6':'\u0176', '\xc5\xb7':'\u0177', '\xc5\xb8':'\u0178', '\xc5\xb9':'\u0179', '\xc5\xba':'\u017a', '\xc5\xbb':'\u017b', '\xc5\xbc':'\u017c', '\xc5\xbd':'\u017d', '\xc5\xbe':'\u017e', '\xc5\xbf':'\u017f', '\xc6\x80':'\u0180', '\xc6\x81':'\u0181', '\xc6\x82':'\u0182', '\xc6\x83':'\u0183', '\xc6\x84':'\u0184', '\xc6\x85':'\u0185', '\xc6\x86':'\u0186', '\xc6\x87':'\u0187', '\xc6\x88':'\u0188', '\xc6\x89':'\u0189', '\xc6\x8a':'\u018a', '\xc6\x8b':'\u018b', '\xc6\x8c':'\u018c', '\xc6\x8d':'\u018d', '\xc6\x8e':'\u018e', '\xc6\x8f':'\u018f', '\xc6\x90':'\u0190', '\xc6\x91':'\u0191', '\xc6\x92':'\u0192', '\xc6\x93':'\u0193', '\xc6\x94':'\u0194', '\xc6\x95':'\u0195', '\xc6\x96':'\u0196', '\xc6\x97':'\u0197', '\xc6\x98':'\u0198', '\xc6\x99':'\u0199', '\xc6\x9a':'\u019a', '\xc6\x9b':'\u019b', '\xc6\x9c':'\u019c', '\xc6\x9d':'\u019d', '\xc6\x9e':'\u019e', '\xc6\x9f':'\u019f', '\xc6\xa0':'\u01a0', '\xc6\xa1':'\u01a1', '\xc6\xa2':'\u01a2', '\xc6\xa3':'\u01a3', '\xc6\xa4':'\u01a4', '\xc6\xa5':'\u01a5', '\xc6\xa6':'\u01a6', '\xc6\xa7':'\u01a7', '\xc6\xa8':'\u01a8', '\xc6\xa9':'\u01a9', '\xc6\xaa':'\u01aa', '\xc6\xab':'\u01ab', '\xc6\xac':'\u01ac', '\xc6\xad':'\u01ad', '\xc6\xae':'\u01ae', '\xc6\xaf':'\u01af', '\xc6\xb0':'\u01b0', '\xc6\xb1':'\u01b1', '\xc6\xb2':'\u01b2', '\xc6\xb3':'\u01b3', '\xc6\xb4':'\u01b4', '\xc6\xb5':'\u01b5', '\xc6\xb6':'\u01b6', '\xc6\xb7':'\u01b7', '\xc6\xb8':'\u01b8', '\xc6\xb9':'\u01b9', '\xc6\xba':'\u01ba', '\xc6\xbb':'\u01bb', '\xc6\xbc':'\u01bc', '\xc6\xbd':'\u01bd', '\xc6\xbe':'\u01be', '\xc6\xbf':'\u01bf', '\xc7\x80':'\u01c0', '\xc7\x81':'\u01c1', '\xc7\x82':'\u01c2', '\xc7\x83':'\u01c3', '\xc7\x84':'\u01c4', '\xc7\x85':'\u01c5', '\xc7\x86':'\u01c6', '\xc7\x87':'\u01c7', '\xc7\x88':'\u01c8', '\xc7\x89':'\u01c9', '\xc7\x8a':'\u01ca', '\xc7\x8b':'\u01cb', '\xc7\x8c':'\u01cc', '\xc7\x8d':'\u01cd', '\xc7\x8e':'\u01ce', '\xc7\x8f':'\u01cf', '\xc7\x90':'\u01d0', '\xc7\x91':'\u01d1', '\xc7\x92':'\u01d2', '\xc7\x93':'\u01d3', '\xc7\x94':'\u01d4', '\xc7\x95':'\u01d5', '\xc7\x96':'\u01d6', '\xc7\x97':'\u01d7', '\xc7\x98':'\u01d8', '\xc7\x99':'\u01d9', '\xc7\x9a':'\u01da', '\xc7\x9b':'\u01db', '\xc7\x9c':'\u01dc', '\xc7\x9d':'\u01dd', '\xc7\x9e':'\u01de', '\xc7\x9f':'\u01df', '\xc7\xa0':'\u01e0', '\xc7\xa1':'\u01e1', '\xc7\xa2':'\u01e2', '\xc7\xa3':'\u01e3', '\xc7\xa4':'\u01e4', '\xc7\xa5':'\u01e5', '\xc7\xa6':'\u01e6', '\xc7\xa7':'\u01e7', '\xc7\xa8':'\u01e8', '\xc7\xa9':'\u01e9', '\xc7\xaa':'\u01ea', '\xc7\xab':'\u01eb', '\xc7\xac':'\u01ec', '\xc7\xad':'\u01ed', '\xc7\xae':'\u01ee', '\xc7\xaf':'\u01ef', '\xc7\xb0':'\u01f0', '\xc7\xb1':'\u01f1', '\xc7\xb2':'\u01f2', '\xc7\xb3':'\u01f3', '\xc7\xb4':'\u01f4', '\xc7\xb5':'\u01f5', '\xc7\xb6':'\u01f6', '\xc7\xb7':'\u01f7', '\xc7\xb8':'\u01f8', '\xc7\xb9':'\u01f9', '\xc7\xba':'\u01fa', '\xc7\xbb':'\u01fb', '\xc7\xbc':'\u01fc', '\xc7\xbd':'\u01fd', '\xc7\xbe':'\u01fe', '\xc7\xbf': '\u01ff'}
robj = re.compile('|'.join(rdict.keys()))
content = robj.sub(lambda m: rdict[m.group(0)], content)
content = re.sub('\s\s+',' ',content)
content = re.sub(' \.', '.',content)
content = re.sub(' ,', ',',content)
content = re.sub(' ;', ';',content)
content = re.sub(u'(\\n\s|\\n)', '', content)
return content
def multipleParts(partsPath,tree):
global creator
partsList = tree.xpath(partsPath)
for part in partsList:
creator = creator + etree.tostring(part, encoding='UTF-8', method='text').strip() + " " # This may need to be edited for your preferred separator between <part> elements.
creator = creator.strip() + "|"
def multipleCreators(genPath,tree):
global creator
num = lenTest(genPath,tree)
increment = 1
while increment <= num:
parts = genPath + "[" + str(increment) + "]" + "/part"
if lenTest(parts,tree) > 1:
multipleParts(parts,tree)
elif lenTest(parts,tree) == 1:
partsVal = tree.xpath(parts)[0]
creator = creator + etree.tostring(partsVal, encoding='UTF-8', method='text').strip() + "|"
increment += 1
def getOrigination(corpXPath, famXPath, nameXPath, persXPath,tree):
global creator
if lenTest(corpXPath,tree) > 1:
multipleCreators(corpXPath,tree)
elif lenTest(corpXPath,tree) == 1:
parts = corpXPath + "[1]/part"
partsVal = tree.xpath(parts)[0]
creator = creator + etree.tostring(partsVal, encoding='UTF-8', method='text').strip() + "|"
if lenTest(famXPath,tree) > 1:
multipleCreators(famXPath,tree)
elif lenTest(famXPath,tree) == 1:
parts = famXPath + "[1]/part"
partsVal = tree.xpath(parts)[0]
creator = creator + etree.tostring(partsVal, encoding='UTF-8', method='text').strip() + "|"
if lenTest(nameXPath,tree) > 1:
multipleCreators(nameXPath,tree)
elif lenTest(nameXPath,tree) == 1:
parts = nameXPath + "[1]/part"
partsVal = tree.xpath(parts)[0]
creator = creator + etree.tostring(partsVal, encoding='UTF-8', method='text').strip() + "|"
if lenTest(persXPath,tree) > 1:
multipleCreators(persXPath,tree)
elif lenTest(persXPath,tree) == 1:
parts = persXPath + "[1]/part"
partsVal = tree.xpath(parts)[0]
creator = creator + etree.tostring(partsVal, encoding='UTF-8', method='text').strip() + "|"
creator = cleanerApp(creator)
def getScopeContent(genPath,tree):
global scopeAndContent
scopeParagraphPath = genPath + "[1]" + "/p"
scopeParagraphs = tree.xpath(scopeParagraphPath)
for para in scopeParagraphs:
scopeAndContent = scopeAndContent + etree.tostring(para, encoding='UTF-8', method='text').strip() + "\u000a\u000a"
scopeAndContent = cleanerApp(scopeAndContent)
def stripProcessing(tree, each):
for instruction in tree.xpath("//processing-instruction('xml-stylesheet')"):
etree.strip_tags(instruction.getparent() or tree, instruction.tag)
xml_file = open(each, 'w')
xml_file.write(etree.tostring(tree, pretty_print=True))
xml_file.close()
today = datetime.date.today().strftime("%Y_%m_%d")
directory = "/Volumes/curatend-batch/data/EAD_Ingest_New_" + today # Hardcoded path may have issue if DCNS changes.
titleXPath = "/ead/control/filedesc/titlestmt/titleproper"
scopeXPath = "/ead/archdesc/scopecontent"
corpXPath = "/ead/archdesc/did/origination/corpname"
persXPath = "/ead/archdesc/did/origination/persname"
famXPath = "/ead/archdesc/did/origination/famname"
nameXPath = "/ead/archdesc/did/origination/name"
creator = ""
scopeAndContent = ""
def createCSV(directory, outputFile):
global creator, scopeAndContent
os.chdir(directory)
f = open(outputFile, 'w')
f.write("type,owner,access,files,dc:creator#administrative_unit,dc:title,dc:abstract,dc:creator,dc:source\n") # Non-Notre Dame users will want to remove or replace "type,owner,access". Users may wish to rename "files" (in f.write, not as variable)
files = glob.glob("*.xml")
for each in files:
tree = etree.parse(each)
titleString = cleanerApp(etree.tostring(tree.xpath(titleXPath)[0], encoding='UTF-8', method='text').strip())
creator = ""
scopeAndContent = ""
typeString = "Work-FindingAid" # Non-Notre Dame users will want to remove or replace.
ownerString = "rtillman" # Non-Notre Dame users will want to remove or replace.
accessString = "public;edit=rtillman" # Non-Notre Dame users will want to remove or replace.
departmentString = "University of Notre Dame::Hesburgh Libraries::University Archives"
sourceString = "http://archives.nd.edu/findaids/ead/xml/" + each # Will need to replace with local information or remove dc:source
titleString = cleanerApp(titleString.replace('"', '\u0022'))
getOrigination(corpXPath, famXPath, nameXPath, persXPath,tree)
creator = creator.replace('"', '\u0022')
getScopeContent(scopeXPath,tree)
scopeAndContent = scopeAndContent.replace('"', '\u0022')
creator = '"' + creator + '"'
titleString = '"' + titleString + '"'
scopeAndContent = '"' + scopeAndContent + '"'
line = typeString + "," + ownerString + "," + accessString + "," + each + "," + departmentString + "," + titleString + "," + scopeAndContent + "," + creator + "," + sourceString + "\n"
f.write(line)
stripProcessing(tree, each)
createCSV(directory,"metadata-1.csv")