forked from lcnetdev/lds-processing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbf2m-batch.py
137 lines (111 loc) · 4.28 KB
/
bf2m-batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/python
# Works on the files in a directory (the daily transforms to BIBFRAME, not records exported from the database).
# Takes a directory full of BIBFRAME RDF/XML files and writes one MARCXML file per input, each wrapping its records in a marc:collection.
import glob
import os
import sys
import shutil
import lxml
from lxml import etree as ET
from lxml.builder import ElementMaker
import json
import multiprocessing
def runxslt(parsedfile, xslconvert, transformname):
    """Apply a transform to a parsed document, echoing debug output to stdout.

    Args:
        parsedfile: Parsed XML tree. It must provide ``getroot()`` and be
            serializable with ``ET.tostring``.
        xslconvert: Callable invoked as ``xslconvert(parsedfile)``;
            presumably one of the compiled ``ET.XSLT`` stylesheets.
        transformname: Label used in the progress and error messages. The
            value ``"bf2marc"`` additionally dumps the input document before
            and after the transform runs.

    Returns:
        Whatever ``xslconvert`` returned, or ``None`` when it raised.
    """
    print(ET.tostring(parsedfile))
    print("transforming with " + transformname)
    is_bf2marc = transformname == "bf2marc"
    if is_bf2marc:
        print(ET.tostring(parsedfile.getroot()))

    try:
        result = xslconvert(parsedfile)
    except Exception:
        # The result name is never bound when the transform fails, so report
        # the failure and stop here instead of touching it below.
        print("error using " + transformname)
        return None

    if is_bf2marc:
        print(ET.tostring(parsedfile))
    print(ET.tostring(result))
    return result
# Directory holding the input splits: the first command-line argument when
# given, otherwise the current working directory.
if len(sys.argv) > 1:
    path_to_data = sys.argv[1]
else:
    path_to_data = os.getcwd()

# NOTE(review): both paths below live under path_to_data, while the directory
# recreated further down is ./out relative to the working directory. They are
# the same place only when the script is run from path_to_data -- confirm.
# output_filename is not read anywhere in the visible script.
output_filename = path_to_data + '/out/mrc.xml'
efilename = path_to_data + '/out/error.txt'

# Sorted so that files are processed in a reproducible order; glob itself
# makes no ordering promise.
files = sorted(glob.glob(path_to_data + '/split_000000*.xml.rdf'))

# Start from an empty ./out directory. rmtree() raises when the directory is
# missing (e.g. on a first run), so only remove it when it is really there.
if os.path.isdir("out"):
    shutil.rmtree("out")
os.mkdir("out")
print(len(files))

# stylesheets declared:
# NOTE(review): nothing visible in this script applies graphxsl, yet a missing
# graphiphy.xsl aborts start-up here -- confirm it is still needed.
graphxsl = ET.XSLT(ET.parse('graphiphy.xsl'))
bf2marcxsl = ET.XSLT(ET.parse("/marklogic/applications/bibframe2marc/bibframe2marc.xsl"))
# Element factories and tag names do not change between files or records, so
# they are built once instead of on every iteration.
MARC = ElementMaker(namespace="http://www.loc.gov/MARC21/slim",
                    nsmap={"marc": "http://www.loc.gov/MARC21/slim"})
RDF = ElementMaker(namespace="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                   nsmap={"lclocal": "http://id.loc.gov/ontologies/lclocal/",
                          "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
                          "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                          "madsrdf": "http://www.loc.gov/mads/rdf/v1#",
                          "bf": "http://id.loc.gov/ontologies/bibframe/",
                          "bflc": "http://id.loc.gov/ontologies/bflc/"})
GRAPH_PATH = './/{http://id.loc.gov/ontologies/lclocal/}graph'
WORK_TAG = "{http://id.loc.gov/ontologies/bibframe/}Work"
INSTANCE_TAG = "{http://id.loc.gov/ontologies/bibframe/}Instance"


def _log_conversion_error(source_path, rdf_element, error):
    """Append the source file, the error and the offending RDF to efilename."""
    error_dir = os.path.dirname(efilename)
    # The error file may sit in a directory nobody has created yet.
    if error_dir and not os.path.isdir(error_dir):
        os.makedirs(error_dir)
    # ET.tostring() returns bytes, so the log is opened in binary append mode.
    with open(efilename, 'ab') as errorfile:
        header = "{0}: {1!r}\n".format(source_path, error)
        errorfile.write(header.encode("utf-8"))
        errorfile.write(ET.tostring(rdf_element))
        errorfile.write(b"\n")


# Convert every input file into one MARCXML collection under ./out.
# enumerate() keeps a running count across files so the progress line below
# actually fires every 100 files.
for counter, rdf_path in enumerate(files, 1):
    # Derive the output name from the file name only, so a ".xml" somewhere
    # in a directory name cannot truncate it.
    stem = os.path.basename(rdf_path).split('.xml', 1)[0]
    outfile = os.path.join('out', stem + '.mrc.xml')
    print(outfile)
    print("file: " + rdf_path)
    if counter % 100 == 0:
        print(counter, '/', len(files))

    # The splits are used as parsed: no graphing transform is applied here.
    tree = ET.parse(rdf_path)

    # create output marcxml:collection:
    coll = MARC.collection()

    # for each "graph/record"
    for graph in tree.getroot().iterfind(GRAPH_PATH):
        # ElementMaker rejects None children, so keep only the parts that
        # are really present in this graph.
        parts = [node
                 for node in (graph.find(WORK_TAG), graph.find(INSTANCE_TAG))
                 if node is not None]
        if not parts:
            print("skipping graph without bf:Work or bf:Instance in " + rdf_path)
            continue
        rdf = RDF.RDF(*parts)

        try:
            result = bf2marcxsl(rdf)
        except Exception as err:
            # Best effort: report the failure, keep the failing input for
            # later inspection, and move on to the next record.
            print("Unexpected error:", type(err), err)
            _log_conversion_error(rdf_path, rdf, err)
            continue

        record = result.getroot()
        if record is None:
            print("transform produced no record in " + rdf_path)
            continue
        # Append so that records keep the order they had in the input.
        coll.append(record)

    # The with-statement closes the file; no explicit close() is needed.
    with open(outfile, 'wb') as out:
        out.write(ET.tostring(coll))