-
Notifications
You must be signed in to change notification settings - Fork 1
/
markdown_utils.rb
151 lines (137 loc) · 5.49 KB
/
markdown_utils.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
require 'fileutils'
require 'open-uri'
require 'nokogiri'
module MarkdownUtils
# Stylesheets are compiled once, when this module is loaded. The paths are
# relative, so they resolve against the process's working directory.
# The block form of File.open closes each file as soon as it has been parsed
# instead of leaving the handle open until garbage collection.
XSLT_METALEX = File.open('xslt/metalex_to_bwb.xslt') { |f| Nokogiri::XSLT(f) }
XSLT_MARKDOWN = File.open('xslt/xml_to_markdown.xslt') { |f| Nokogiri::XSLT(f) }
# Deletes the README.md / README.txt files inside +md_folder+ and then prunes
# the directories left behind, walking upwards from +md_folder+.
#
# A directory is pruned when it has no visible (non-dot) entries and does not
# contain a +.git+ entry. The walk stops at the first directory that has
# visible entries, contains +.git+, cannot be read, or is the filesystem root.
# NOTE: dot-files other than +.git+ do not protect a directory from pruning.
#
# @param entry [Object] unused; kept so existing callers keep working
# @param md_folder [String] folder that holds the README files
# @return [String] +md_folder+, unchanged
def delete_file(entry, md_folder)
  %w[README.md README.txt].each do |name|
    readme = File.expand_path("#{md_folder}/#{name}")
    # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
    File.delete(readme) if File.exist?(readme)
  end

  folder = File.expand_path(md_folder)
  # Find out up to which directory it is safe to delete, so we don't end up
  # with empty directories.
  loop do
    names = []
    if File.directory?(folder)
      break unless File.readable?(folder) # cannot inspect it, so do not prune it
      # List the entries directly instead of globbing "#{folder}/*": a glob
      # would treat characters such as [ ] { } * ? in the folder name as
      # pattern syntax and could report a populated folder as empty.
      names = Dir.entries(folder) - %w[. ..]
    end
    break if names.any? { |n| !n.start_with?('.') } # folder has visible children

    if File.exist?(folder)
      break if names.include?('.git') # never remove a repository root
      FileUtils.rm_rf folder
    end

    parent = File.expand_path('..', folder)
    break if parent == folder # reached the filesystem root
    folder = parent
  end
  md_folder
end
# Writes +text+ (followed by a newline if it lacks one) to the file at +path+,
# replacing any previous content. +folder+ is created first, including missing
# parents, when nothing exists at that location yet.
#
# @param text [#to_s] content to write
# @param folder [String] directory to create; presumably the directory that
#   contains +path+ -- confirm against callers
# @param path [String] file to (over)write
# @return [nil]
def write_file(text, folder, path)
  # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
  FileUtils.mkdir_p folder unless File.exist?(folder)
  # File.open instead of Kernel#open, which would run a path starting with
  # "|" as a shell command.
  File.open(path, 'w') { |f| f.puts text }
end
# Downloads the plain-text (ASCII export) version of a law from
# wetten.overheid.nl and returns it without its first three lines.
#
# The export is a zip archive; the first entry of the archive is read.
# Any StandardError raised while downloading or unpacking is reported (to
# +logger+ when given, and always to stdout) and results in +nil+.
#
# @param bwbid [String] BWB identifier used in the download URL
# @param date_last_modified [String] date used in the "geldigheidsdatum_" URL segment
# @param logger [#error, nil] optional logger for failures
# @return [String, nil] the text, or nil when it could not be retrieved
def get_plain_text(bwbid, date_last_modified, logger=nil)
  url = "http://wetten.overheid.nl/#{bwbid}/geldigheidsdatum_#{date_last_modified}/opslaan_in_ascii"
  text = nil
  begin
    puts "Getting txt for #{bwbid}/#{date_last_modified}"
    # URI.open (open-uri) instead of Kernel#open, which stopped opening URLs
    # in Ruby 3.0. The block form closes the downloaded file afterwards.
    URI.open(url, read_timeout: 20 * 60) do |zipped_file|
      Zip::File.open(zipped_file) do |zip|
        # Strip first 3 lines which contain '(Tekst geldend op: DD-MM-YYYY)'.
        # drop(3) yields an empty text for a file shorter than three lines
        # instead of failing on nil and being reported as a download error.
        text = zip.read(zip.first).force_encoding('utf-8').lines.drop(3).join
      end
    end
  rescue StandardError
    logger.error "Could not open #{url}" if logger
    puts "ERROR: Could not open #{url}"
  end
  text
end
# Convert the given entry XML to markdown, for the given BWB law list.
#
# Results are cached under cache/ keyed by BWB id and
# entry[DATE_LAST_MODIFIED]; a cache file in either the current
# ("<id>.<date>.md") or the older ("<id>%2F<date>.md") naming scheme is
# returned as-is. Otherwise the XML is parsed, its external references are
# resolved via set_hrefs, the markdown stylesheet is applied, the result is
# passed through format_markdown and written to the cache.
#
# @param bwb_id [String] BWB identifier of the law being converted
# @param entry [Hash] law metadata; only entry[DATE_LAST_MODIFIED] is read here
# @param law_list [Hash] passed on to set_hrefs
# @param str_xml [String] XML source of the law
# @return [String] the markdown text
def convert_to_markdown(bwb_id, entry, law_list, str_xml)
  cache_path = "cache/#{bwb_id}.#{entry[DATE_LAST_MODIFIED]}.md"
  old_cache_path = "cache/#{bwb_id}%2F#{entry[DATE_LAST_MODIFIED]}.md"

  # File.read closes the file after reading; File.open(path).read left the
  # handle open on every cache hit.
  return File.read(cache_path) if File.exist?(cache_path)
  return File.read(old_cache_path) if File.exist?(old_cache_path)

  # Not cached: convert XML to markdown
  xml_base = Nokogiri::XML(str_xml)
  set_hrefs(bwb_id, law_list, xml_base)
  # kramdown is not used for pretty-printing here because it escapes tables.
  markdown = format_markdown(XSLT_MARKDOWN.apply_to(xml_base))
  File.open(cache_path, 'w+') { |f| f.puts markdown }
  puts "written #{cache_path} to cache"
  markdown
end
# Tidies up generated markdown, modifying +markdown+ in place.
#
# Leading spaces and header lines without a title are removed, runs of three
# or more (possibly whitespace-padded) newlines are collapsed to one blank
# line, and numbered list items are handed to substitute! so the whitespace
# after the number can be normalised.
#
# @param markdown [String] mutable markdown text
# @return [String] the same string prefixed with a UTF-8 <meta> tag, or a new
#   empty string when nothing but whitespace is left
def format_markdown(markdown)
  cleanups = [
    [/^[ ]+/, ''],               # leading spaces
    [/^[#]+\s*$/, ''],           # headers without any text
    [/(\n\s*){3,}/, "\n\n"]      # more than two consecutive newlines
  ]
  cleanups.each { |pattern, replacement| markdown.gsub!(pattern, replacement) }

  # Pretty print numbered list items (the whitespace following "1.")
  substitute!(/^[0-9]+\.(?<spaces>\s*)[^\s]*$/, markdown, {:spaces => ' '})

  # Nothing but whitespace left: report an empty document
  return '' unless markdown =~ /\S/

  markdown.prepend("<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />\n")
  markdown
end
private
# Fills in the 'ref' attribute of every <extref doc="..."> element found in
# +xml_base+.
#
# The BWB id of the target law is taken from the element's 'doc' attribute
# (e.g. "1.0:v:BWBR0001834&artikel=15"). When both the current law and the
# target law are present in +law_list+, 'ref' becomes the result of
# get_path_to for their 'path' values. When no BWB id can be extracted (for
# instance doc="" on a reference to an EU regulation) or the current law is
# not in +law_list+, 'ref' is set to an empty string. An unresolvable target
# leaves the element untouched and prints a warning, except for a fixed set of
# ids that are known to be missing.
#
# @param bwb_id [String] id of the law the XML belongs to
# @param law_list [Hash] BWB id => hash with at least a 'path' key
# @param xml_base [#xpath] parsed XML document; elements are updated in place
def set_hrefs(bwb_id, law_list, xml_base)
  extrefs = xml_base.xpath('.//extref[@doc]')
  unless extrefs && extrefs.respond_to?('[]')
    puts "WARNING: #{extrefs} did not respond to []"
    return
  end

  extrefs.each do |node|
    next unless node && node.respond_to?('[]')

    hit = /.\..:.*:(BWB[^&]*)/.match(node['doc'])
    target_id = hit && hit[1]

    unless target_id && law_list[bwb_id]
      node['ref'] = ''
      next
    end

    source_path = law_list[bwb_id]['path']
    target = law_list[target_id]
    if target
      node['ref'] = get_path_to(source_path, target['path'])
    elsif !['BWBR0003018', 'BWBR0002530', 'BWBR0002664', 'BWBR0003011', 'BWBR0006064', 'BWBR0026755', 'BWBR0002147'].include?(target_id)
      # Anything outside the list of 'known unknowns' deserves a warning
      puts "WARNING: handling extref; link to #{target_id} is not resolvable with our BWB ID list"
    end
  end
end
end