-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathxlsx2json
299 lines (255 loc) · 9.12 KB
/
xlsx2json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# -*- coding: utf-8 -*-
# xlsx2json.py - convert sesamo-derived files to archimista-compatible
# json-like files.
# Copyright (C) 2015 Matteo Tiberti <[email protected]>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Rompicazzo GNU General Public License.
# This is the same as the GNU General Public License, as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version, with one additional clause.
# Any work derived from this software MUST print in the standard output
# (or similar output stream) as the FIRST output lines
# of the program the following sentence: "IL GELMA E' UN PUZZONE",
# possibly highlighted in some way.
# The message containing this print MUST be visible to the user for at least
# one second in the program output. If the software does
# not print to a terminal, the use of interface elements to display
# the text is mandatory.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import openpyxl as oxl
import codecs
import json
import hashlib
import datetime
import zipfile
import argparse
import logging
from collections import OrderedDict
from sys import stdout
# Fix json in order to support fucked up """"JSON"""" Archimista format
# by overriding _make_iterencode. Fuck that shit
class ArchimistaJSONEncoder(json.JSONEncoder):
def default(self, o):
if o == 'true':
return [True]
if o == 'false':
return [False]
if o == 'null':
return [None]
return json.JSONEncoder.default(self, o)
"""
def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
_key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
ValueError=ValueError,
basestring=basestring,
dict=dict,
float=float,
id=id,
int=int,
isinstance=isinstance,
list=list,
long=long,
str=str,
tuple=tuple,
):
def _iterencode_list(lst, _current_indent_level):
if not lst:
yield '[]'
return
if markers is not None:
markerid = id(lst)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = lst
buf = '['
if _indent is not None:
_current_indent_level += 1
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
separator = _item_separator + newline_indent
buf += newline_indent
else:
newline_indent = None
separator = _item_separator
first = True
for value in lst:
if first:
first = False
else:
buf = separator
if isinstance(value, basestring): # This is where the magic happens
print value.strip().lower()
if value.strip().lower() == 'true':
print "VERO", value
yield buf + 'true'
elif value.strip().lower() == 'false':
yield buf + 'false'
elif value.strip().lower() == 'null':
yield buf + 'null'
else:
yield buf + _encoder(value)
elif value is None:
yield buf + 'null'
elif value is True or value.strip() == 'true':
yield buf + 'true'
elif value is False or value.strip() == 'false':
yield buf + 'false'
elif isinstance(value, (int, long)):
yield buf + str(value)
elif isinstance(value, float):
yield buf + _floatstr(value)
else:
yield buf
if isinstance(value, (list, tuple)):
chunks = _iterencode_list(value, _current_indent_level)
elif isinstance(value, dict):
chunks = _iterencode_dict(value, _current_indent_level)
else:
chunks = _iterencode(value, _current_indent_level)
for chunk in chunks:
yield chunk
if newline_indent is not None:
_current_indent_level -= 1
yield '\n' + (' ' * (_indent * _current_indent_level))
yield ']'
if markers is not None:
del markers[markerid]
"""
#stdout = codecs.getwriter(stdout.encoding)(stdout);
def get_headers(ws):
headers = []
logging.debug("getting headers...")
for cell in ws.rows[0]:
if cell.value != '':
headers.append(cell.value)
#logging.debug("HEADERS for %s: %s" %("asd", "\n".join(headers)))
logging.debug("done!")
return headers
def empty_row(ws_name, row):
markers = { "fond":1,
"unit":2,
"fond_event":0,
"unit_event":0 }
if row[markers[ws_name]].value == None or row[markers[ws_name]].value == "":
return True
return False
def archimista_now():
dt = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
tz = datetime.datetime.now().strftime('%z') # let's to it this way now; may use a real tzinfo later
if not tz:
tz = "+0100" # Default to CET
return "%s%s:%s" % (dt, tz[0:3],tz[3:5])
# Some variables for the metadata file
#####
mode = 'full'
version = 121
producer = 'i686-pc-mingw32'
attached_entity = 'Fond'
#####
#Let's go !
print """
##########################
# #
# #
# IL GELMA E' UN PUZZONE #
# #
# #
##########################
"""
parser = argparse.ArgumentParser(description='Transform xlsx files derived from Sesamo databases to the Archimista "JSON" format.')
parser.add_argument('-v','--verbose',dest='verbose',help="verbose mode", action='store_true', default=False)
parser.add_argument('-d','--debug', dest='debug', help="debug mode", action='store_true', default=False)
parser.add_argument('-o','--output', dest='outfile', help="output file name", default=None)
parser.add_argument('infile', default=None)
args = parser.parse_args()
if args.outfile == None:
args.outfile = args.infile[0:-4]+"aef"
if args.debug:
logging.basicConfig(level=logging.DEBUG)
elif args.verbose:
logging.basicConfig(level=logging.INFO)
else:
logging.basicConfig(level=logging.WARNING)
logging.debug("loading worksheet...")
wb = oxl.load_workbook(filename = args.infile)
logging.debug("loaded worksheet")
wss_names = wb.get_sheet_names()
logging.debug("names are: %s" % ", ".join(wss_names))
wss = dict ( zip(wss_names, [wb.get_sheet_by_name(wss_names[i]) for i in range(len(wss_names))] ) )
#print zip(wss_names, [get_headers(wss[i]) for i in wss_names])
ws_headers = OrderedDict( zip(wss_names, [get_headers(wss[i]) for i in wss_names]) )
outstr = ""
# Write data file
data = []
for ws_name in wss_names:
logging.debug("Working on: sheet %s" % ws_name)
linen = 0
ws = wss[ws_name]
for row in ws.rows[1:]:
linen += 1
logging.debug("doing line %d" % linen)
if empty_row(ws_name, row):
continue
data = OrderedDict([(ws_name,OrderedDict())])
for i,ws_header in enumerate(ws_headers[ws_name]):
if row[i].is_date:
row[i].value = str(row[i].value)
elif row[i].value == "null":
row[i].value = None
elif row[i].value == "true":
row[i].value = True
elif row[i].value == "false":
row[i].value = False
if isinstance(row[i].value, basestring):
#asd = row[i].value.decode('string_escape')
#data[-1][ws_name][ws_header] = row[i].value.decode('utf-8')
try:
data[ws_name][ws_header] = row[i].value.decode('unicode-escape')
except:
data[ws_name][ws_header] = row[i].value
#data[-1][ws_name][ws_header] = unicode(row[i].value)
#print "AA--", row[i].value
#print " --", asd
#print "BB--", data[-1][ws_name][ws_header]
#data[-1][ws_name][ws_header] = row[i].value
else:
data[ws_name][ws_header] = row[i].value
#for line in data:
outstr += "%s\r\n\r\n" % json.dumps(data,
ensure_ascii = True,
encoding="utf8",
separators=(',', ':'),
cls=ArchimistaJSONEncoder)
logging.debug("Writing outfile")
if args.debug:
outfh = codecs.open("out.json",mode='w',encoding='utf8')
outfh.write(outstr)
outfh.close()
# write metadata file
m = hashlib.sha256()
m.update(outstr)
metadata = OrderedDict([('mode',mode),
('date',archimista_now()),
('checksum',m.hexdigest()),
('version',version),
('producer',producer),
('attached_entity',attached_entity)])
mdstr = json.dumps(metadata,
ensure_ascii = True,
encoding="utf8",
separators=(',', ':'),
cls=ArchimistaJSONEncoder)
if args.debug:
mdfh = open('metadata.json','w')
mdfh.write(mdstr)
mdfh.close()
# write zip
zipfh = zipfile.ZipFile(args.outfile,'w')
zipfh.writestr('data.json',outstr)
zipfh.writestr('metadata.json',mdstr)
zipfh.close()