From d4c705cadcabc7adccf0a4009fce459ea739cfae Mon Sep 17 00:00:00 2001 From: eiglesias34 Date: Tue, 14 May 2024 14:29:41 +0200 Subject: [PATCH] Added new RML formulation, RML-Star and FNML for CSV, and fixed some encoding issues --- README.md | 2 +- VERSION | 2 +- rdfizer/rdfizer/__init__.py | 7660 +++++++++++++++++++------ rdfizer/rdfizer/fnml_functions.py | 151 + rdfizer/rdfizer/functions.py | 274 +- rdfizer/rdfizer/inner_functions.py | 680 +++ rdfizer/rdfizer/mapping_functions.py | 134 + rdfizer/rdfizer/semantify.py | 7688 ++++++++++++++++++++------ 8 files changed, 13189 insertions(+), 3402 deletions(-) create mode 100644 rdfizer/rdfizer/fnml_functions.py create mode 100644 rdfizer/rdfizer/inner_functions.py create mode 100644 rdfizer/rdfizer/mapping_functions.py diff --git a/README.md b/README.md index adf52a3..86f37d6 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ You can easily customize your own configurations from the set of features that S ## Version ``` -4.7.3.5 +4.7.4 ``` ## RML-Test Cases diff --git a/VERSION b/VERSION index b9d288c..5ca7df9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -4.7.3.5 \ No newline at end of file +4.7.4 \ No newline at end of file diff --git a/rdfizer/rdfizer/__init__.py b/rdfizer/rdfizer/__init__.py index 9dfa9ef..6ab6360 100755 --- a/rdfizer/rdfizer/__init__.py +++ b/rdfizer/rdfizer/__init__.py @@ -13,7 +13,17 @@ import pandas as pd import xml.etree.ElementTree as ET from urllib.request import urlopen +import gzip +import requests +import shutil +import zipfile +import io +import tarfile +from SPARQLWrapper import SPARQLWrapper, JSON from .functions import * +from .fnml_functions import * +from .mapping_functions import * +from .inner_functions import * import logging try: @@ -64,6 +74,12 @@ base = "" global blank_message blank_message = True +global delimiter +delimiter = {} +global logical_dump +logical_dump = {} +global current_logical_dump +current_logical_dump = "" global general_predicates general_predicates = {"http://www.w3.org/2000/01/rdf-schema#subClassOf": "", "http://www.w3.org/2002/07/owl#sameAs": "", @@ -186,7 +202,7 @@ def dictionary_table_update(resource): id_number += 1 -def join_iterator(data, iterator, parent, child): +def join_iterator(data, iterator, parent, child, triples_map_list): if iterator != "": new_iterator = "" temp_keys = iterator.split(".") @@ -219,28 +235,28 @@ def join_iterator(data, iterator, parent, child): row = row[list(row.keys())[0]] if isinstance(row, list): for sub_row in row: - join_iterator(sub_row, iterator, parent, child) + join_iterator(sub_row, iterator, parent, child, triples_map_list) executed = False break elif isinstance(row, str): row = [] break else: - join_iterator(row[list(row.keys())[0]], "", parent, child) + join_iterator(row[list(row.keys())[0]], "", parent, child, triples_map_list) else: path = jsonpath_find(temp_keys[len(temp_keys) - 1], row, "", []) for key in path[0].split("."): if key in temp_keys: - join_iterator(row[key], "", parent, child) + join_iterator(row[key], "", parent, child, triples_map_list) elif key in row: row = row[key] if isinstance(row, list): for sub_row in row: - join_iterator(sub_row, iterator, parent, child) + join_iterator(sub_row, iterator, parent, child, triples_map_list) executed = False break elif isinstance(row, dict): - join_iterator(row, iterator, parent, child) + join_iterator(row, iterator, parent, child, triples_map_list) executed = False break elif isinstance(row, str): @@ -249,23 +265,23 @@ def join_iterator(data, iterator, parent, child): 
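The patch threads `triples_map_list` through every recursive `join_iterator` call so quoted-triples joins can re-run the parent map once the leaf rows are reached. A minimal sketch of that descent pattern, assuming plain dict/list rows (`walk_rows` and `on_row` are illustrative names, not the engine's API):

```python
# Follow a dotted iterator path into nested JSON, forwarding the same
# triples_map_list on every recursive call — the shape join_iterator has
# after this patch (simplified: no wildcard or jsonpath handling).
def walk_rows(data, iterator, triples_map_list, on_row):
    if iterator:
        head, _, rest = iterator.partition(".")
        value = data.get(head) if isinstance(data, dict) else None
        if isinstance(value, list):
            for item in value:                      # fan out over arrays
                walk_rows(item, rest, triples_map_list, on_row)
        elif isinstance(value, dict):
            walk_rows(value, rest, triples_map_list, on_row)
    else:
        on_row(data, triples_map_list)              # leaf row reached

doc = {"people": [{"name": "a"}, {"name": "b"}]}
walk_rows(doc, "people", [], lambda row, tms: print(row))
```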
if new_iterator != ".": if "*" == new_iterator[-2]: for sub_row in row: - join_iterator(sub_row, iterator.replace(new_iterator[:-1], ""), parent, child) + join_iterator(sub_row, iterator.replace(new_iterator[:-1], ""), parent, child, triples_map_list) executed = False break if "[*][*]" in new_iterator: for sub_row in row: for sub_sub_row in row[sub_row]: - join_iterator(sub_sub_row, iterator.replace(new_iterator[:-1], ""), parent, child) + join_iterator(sub_sub_row, iterator.replace(new_iterator[:-1], ""), parent, child, triples_map_list) executed = False break if isinstance(row, list): for sub_row in row: - join_iterator(sub_row, iterator.replace(new_iterator[:-1], ""), parent, child) + join_iterator(sub_row, iterator.replace(new_iterator[:-1], ""), parent, child, triples_map_list) executed = False break else: if parent.triples_map_id + "_" + child.child[0] not in join_table: - hash_maker([data], parent, child) + hash_maker([data], parent, child,"", triples_map_list) else: hash_update([data], parent, child, parent.triples_map_id + "_" + child.child[0]) @@ -329,28 +345,59 @@ def hash_update(parent_data, parent_subject, child_object, join_id): join_table[join_id].update(hash_table) -def hash_maker(parent_data, parent_subject, child_object): +def hash_maker(parent_data, parent_subject, child_object, quoted, triples_map_list): global blank_message hash_table = {} for row in parent_data: - if child_object.parent[0] in row.keys(): - if row[child_object.parent[0]] in hash_table: - if duplicate == "yes": - if parent_subject.subject_map.subject_mapping_type == "reference": - value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, - parent_subject.iterator) - if value != None: + if quoted == "": + if child_object.parent[0] in row.keys(): + if row[child_object.parent[0]] in hash_table: + if duplicate == "yes": + if parent_subject.subject_map.subject_mapping_type == "reference": + value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, + parent_subject.iterator) + if value != None: + if "http" in value and "<" not in value: + value = "<" + value[1:-1] + ">" + elif "http" in value and "<" in value: + value = value[1:-1] + if value not in hash_table[row[child_object.parent[0]]]: + hash_table[row[child_object.parent[0]]].update({value: "object"}) + else: + if string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, + parent_subject.iterator) != None: + value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", + ignore, parent_subject.iterator) + if value != None: + if parent_subject.subject_map.term_type != None: + if "BlankNode" in parent_subject.subject_map.term_type: + if "/" in value: + value = "_:" + encode_char(value.replace("/", "2F")).replace("%", "") + if "." in value: + value = value.replace(".", "2E") + if blank_message: + logger.warning( + "Incorrect format for Blank Nodes. \"/\" will be replace with \"2F\".") + blank_message = False + else: + value = "_:" + encode_char(value).replace("%", "") + if "." 
in value: + value = value.replace(".", "2E") + else: + value = "<" + value + ">" + hash_table[row[child_object.parent[0]]].update({value: "object"}) + else: + if parent_subject.subject_map.subject_mapping_type == "reference": + value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, + parent_subject.iterator) if "http" in value and "<" not in value: value = "<" + value[1:-1] + ">" elif "http" in value and "<" in value: value = value[1:-1] - if value not in hash_table[row[child_object.parent[0]]]: hash_table[row[child_object.parent[0]]].update({value: "object"}) - else: - if string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, - parent_subject.iterator) != None: - value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", - ignore, parent_subject.iterator) + else: + value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, + parent_subject.iterator) if value != None: if parent_subject.subject_map.term_type != None: if "BlankNode" in parent_subject.subject_map.term_type: @@ -369,15 +416,17 @@ def hash_maker(parent_data, parent_subject, child_object): else: value = "<" + value + ">" hash_table[row[child_object.parent[0]]].update({value: "object"}) + else: if parent_subject.subject_map.subject_mapping_type == "reference": value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, parent_subject.iterator) - if "http" in value and "<" not in value: - value = "<" + value[1:-1] + ">" - elif "http" in value and "<" in value: - value = value[1:-1] - hash_table[row[child_object.parent[0]]].update({value: "object"}) + if value != None: + if "http" in value and "<" not in value: + value = "<" + value[1:-1] + ">" + elif "http" in value and "<" in value: + value = value[1:-1] + hash_table.update({row[child_object.parent[0]]: {value: "object"}}) else: value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, parent_subject.iterator) @@ -398,40 +447,26 @@ def hash_maker(parent_data, parent_subject, child_object): value = value.replace(".", "2E") else: value = "<" + value + ">" - hash_table[row[child_object.parent[0]]].update({value: "object"}) - - else: - if parent_subject.subject_map.subject_mapping_type == "reference": - value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, - parent_subject.iterator) - if value != None: - if "http" in value and "<" not in value: - value = "<" + value[1:-1] + ">" - elif "http" in value and "<" in value: - value = value[1:-1] - hash_table.update({row[child_object.parent[0]]: {value: "object"}}) - else: - value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, - parent_subject.iterator) - if value != None: - if parent_subject.subject_map.term_type != None: - if "BlankNode" in parent_subject.subject_map.term_type: - if "/" in value: - value = "_:" + encode_char(value.replace("/", "2F")).replace("%", "") - if "." in value: - value = value.replace(".", "2E") - if blank_message: - logger.warning( - "Incorrect format for Blank Nodes. \"/\" will be replace with \"2F\".") - blank_message = False - else: - value = "_:" + encode_char(value).replace("%", "") - if "." 
in value: - value = value.replace(".", "2E") + hash_table.update({row[child_object.parent[0]]: {value: "object"}}) + else: + for triples in inner_semantify_file(parent_subject, triples_map_list, ",", row, base): + if triples != None: + if isinstance(child_object.parent,list): + parent = child_object.parent[0] + else: + parent = child_object.parent + if row[parent] in hash_table: + if duplicate == "yes": + if triples not in hash_table[row[parent]]: + hash_table[row[parent]].update({triples : "subject"}) else: - value = "<" + value + ">" - hash_table.update({row[child_object.parent[0]]: {value: "object"}}) - join_table.update({parent_subject.triples_map_id + "_" + child_object.child[0]: hash_table}) + hash_table[row[parent]].update({triples : "subject"}) + else: + hash_table.update({row[parent] : {triples : "subject"}}) + if isinstance(child_object.child,list): + join_table.update({parent_subject.triples_map_id + "_" + child_object.child[0] : hash_table}) + else: + join_table.update({"quoted_" + parent_subject.triples_map_id + "_" + child_object.child : hash_table}) def hash_maker_list(parent_data, parent_subject, child_object): @@ -922,8 +957,10 @@ def mappings_expansion(triples_map_list): subject_map = triples_map.subject_map else: subject_map = tm.SubjectMap(triples_map.subject_map.value, triples_map.subject_map.condition, - triples_map.subject_map.subject_mapping_type, [None], - triples_map.subject_map.term_type, triples_map.subject_map.graph) + triples_map.subject_map.subject_mapping_type, + triples_map.subject_map.parent,triples_map.child, [None], + triples_map.subject_map.term_type, triples_map.subject_map.graph, + triples_map.func_result) if po.object_map.mapping_type == "parent triples map": if po.object_map.child != None: for triples_map_element in triples_map_list: @@ -954,7 +991,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -964,7 +1004,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -973,7 +1016,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -982,7 +1028,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] elif str(triples_map.file_format).lower() == "csv": if triples_map.data_source == triples_map_element.data_source: if po.object_map.child[0] == po.object_map.parent[0]: @@ -1005,7 +1054,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, 
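`hash_maker` now takes a `quoted` argument: regular joins index parent rows by the parent column under a `"<parent_tm>_<child_col>"` key, while the RML-star branch materializes whole triples via `inner_semantify_file` and stores them under a `"quoted_"`-prefixed key. A condensed sketch of that index, assuming dict rows (`build_join_index` is an illustrative name):

```python
# Build the per-join lookup table that child rows probe in O(1).
join_table = {}

def build_join_index(parent_rows, parent_tm_id, parent_col, child_col,
                     make_value, quoted=False):
    index = {}
    for row in parent_rows:
        key = row[parent_col]
        # setdefault keeps duplicates out, mirroring the duplicate == "yes" path
        index.setdefault(key, {})[make_value(row)] = (
            "subject" if quoted else "object")
    prefix = "quoted_" if quoted else ""
    join_table[prefix + parent_tm_id + "_" + child_col] = index

rows = [{"id": "1", "iri": "<http://ex.org/1>"}]
build_join_index(rows, "TM1", "id", "ref", lambda r: r["iri"])
print(join_table["TM1_ref"]["1"])   # {'<http://ex.org/1>': 'object'}
```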
triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -1014,7 +1066,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -1023,7 +1078,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: if triples_map.query == triples_map_element.query or triples_map.tablename == triples_map_element.tablename: if po.object_map.child[0] == po.object_map.parent[0]: @@ -1046,7 +1104,11 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -1055,7 +1117,11 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -1064,7 +1130,11 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type, + triples_map.mappings_type)] break else: for triples_map_element in triples_map_list: @@ -1093,7 +1163,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: if len(triples_map_element.predicate_object_maps_list) > 1: po.object_map.value = po.object_map.value + "_1" @@ -1105,7 +1178,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] elif str(triples_map.file_format).lower() == "csv": if triples_map_element.subject_map.subject_mapping_type == "template": object_map = tm.ObjectMap("template", @@ -1126,13 +1202,19 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: new_list += [tm.TriplesMap(triples_map.triples_map_id + "_" + str(i), triples_map.data_source, subject_map, [po], 
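`mappings_expansion` keeps rebuilding `tm.TriplesMap` with the new trailing `function`, `func_map_list`, and `mappings_type` fields while splitting a map that has several predicate-object maps into one map per pair. A toy sketch of the splitting step under that assumption (`SimpleTM` is an illustrative stand-in for `tm.TriplesMap`):

```python
from dataclasses import dataclass, field

@dataclass
class SimpleTM:
    tm_id: str
    source: str
    pom_list: list = field(default_factory=list)

def expand(tm):
    # One triples map with N predicate-object maps becomes N maps with one
    # predicate-object map each, suffixed _0 .. _N-1.
    if len(tm.pom_list) <= 1:
        return [tm]
    return [SimpleTM(f"{tm.tm_id}_{i}", tm.source, [pom])
            for i, pom in enumerate(tm.pom_list)]

tm = SimpleTM("TM1", "data.csv", ["pom_a", "pom_b"])
print([t.tm_id for t in expand(tm)])   # ['TM1_0', 'TM1_1']
```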
triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: if ( triples_map.query != "None" and triples_map_element.query != "None" and triples_map.query == triples_map_element.query) or ( @@ -1156,18 +1238,26 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: new_list += [tm.TriplesMap(triples_map.triples_map_id + "_" + str(i), triples_map.data_source, subject_map, [po], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] break else: new_list += [tm.TriplesMap(triples_map.triples_map_id + "_" + str(i), triples_map.data_source, subject_map, [po], triples_map.reference_formulation, - triples_map.iterator, triples_map.tablename, triples_map.query)] + triples_map.iterator, triples_map.tablename, triples_map.query, + triples_map.function,triples_map.func_map_list, + triples_map.mappings_type)] i += 1 else: new_list += [triples_map] @@ -1182,20 +1272,21 @@ def mappings_expansion(triples_map_list): if str(triples_map.file_format).lower() == "csv" or triples_map.file_format == "JSONPath" or triples_map.file_format == "XPath": if triples_map.data_source == triples_map_element.data_source: if po.object_map.child[0] == po.object_map.parent[0]: - if triples_map_element.subject_map.subject_mapping_type == "template": - object_map = tm.ObjectMap("template", - triples_map_element.subject_map.value, "None", - "None", "None", - triples_map_element.subject_map.term_type, - "None", "None") - else: - object_map = tm.ObjectMap("reference", - triples_map_element.subject_map.value, "None", - "None", "None", - triples_map_element.subject_map.term_type, - "None", "None") - pom_list.append( - tm.PredicateObjectMap(po.predicate_map, object_map, po.graph)) + """if triples_map_element.subject_map.subject_mapping_type == "template": + object_map = tm.ObjectMap("template", + triples_map_element.subject_map.value, "None", + "None", "None", + triples_map_element.subject_map.term_type, + "None", "None") + else: + object_map = tm.ObjectMap("reference", + triples_map_element.subject_map.value, "None", + "None", "None", + triples_map_element.subject_map.term_type, + "None", "None") + pom_list.append( + tm.PredicateObjectMap(po.predicate_map, object_map, po.graph))""" + pom_list.append(po) else: pom_list.append(po) else: @@ -1235,7 +1326,7 @@ def mappings_expansion(triples_map_list): new_list += [ tm.TriplesMap(triples_map.triples_map_id, triples_map.data_source, triples_map.subject_map, pom_list, triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query,triples_map.function,triples_map.func_map_list,triples_map.mappings_type)] return new_list @@ -1267,27 +1358,79 @@ def mapping_parser(mapping_file): sys.exit(1) if new_formulation == "yes": - mapping_query = """ + function_query = """ prefix rr: prefix rml: prefix d2rq: prefix td: prefix hctl: + prefix dcat: + prefix void: + prefix sd: SELECT DISTINCT * WHERE { + OPTIONAL { + ?function_id rml:function ?function . + OPTIONAL { + ?function_id rml:input ?input. + ?input rml:parameter ?param. 
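For the new formulation the parser now runs two SELECT queries over the mapping graph: `function_query` for FNML function declarations, then `mapping_query` for the triples maps. A minimal sketch of that pattern with rdflib, assuming a hypothetical `mapping.ttl` and the `http://w3id.org/rml/` namespace this patch uses elsewhere:

```python
from rdflib import Graph

g = Graph()
g.parse("mapping.ttl", format="turtle")   # hypothetical mapping file

q = """
    PREFIX rml: <http://w3id.org/rml/>
    SELECT DISTINCT ?tm ?source WHERE {
        ?tm rml:logicalSource ?ls .
        OPTIONAL { ?ls rml:source ?source . }
    }
"""
for row in g.query(q):
    print(row.tm, row.source)   # unbound OPTIONALs come back as None
```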
+ OPTIONAL { + ?input rml:inputValue ?input_value. + } + OPTIONAL { + ?input rml:inputValueMap ?input_map. + OPTIONAL {?input_map rml:reference ?param_reference.} + OPTIONAL {?input_map rml:template ?param_template.} + OPTIONAL {?input_map rml:functionExecution ?param_func.} + } + } + } + } + """ + mapping_query = """ + prefix rr: + prefix rml: + prefix d2rq: + prefix td: + prefix hctl: + prefix dcat: + prefix void: + prefix sd: + SELECT DISTINCT * + WHERE { # Subject ------------------------------------------------------------------------- + OPTIONAL{?triples_map_id a ?mappings_type} ?triples_map_id rml:logicalSource ?_source . OPTIONAL{ ?_source rml:source ?source_attr . - ?source_attr rml:root ?root . + OPTIONAL {?source_attr rml:root ?root .} ?source_attr rml:path ?data_source } OPTIONAL{ ?_source rml:source ?data_link . - ?data_link td:hasForm ?form . + ?data_link dcat:downloadURL ?url_source . + } + OPTIONAL{ + ?_source rml:source ?data_link . + ?data_link void:dataDump ?url_source . + } + OPTIONAL{ + ?_source rml:source ?data_link . + ?data_link dcat:url ?url_source . + ?data_link dcat:dialect ?dialect . + ?dialect dcat:delimiter ?delimiter . + } + OPTIONAL{ + ?_source rml:source ?data_link . + ?data_link td:hasPropertyAffordance ?has_form . + ?has_form td:hasForm ?form . ?form hctl:hasTarget ?url_source . } + OPTIONAL{ + ?_source rml:source ?data_link . + ?data_link sd:endpoint ?url_source . + } OPTIONAL {?_source rml:referenceFormulation ?ref_form .} OPTIONAL { ?_source rml:iterator ?iterator . } OPTIONAL { ?_source rr:tableName ?tablename .} @@ -1297,20 +1440,44 @@ def mapping_parser(mapping_file): OPTIONAL {?_subject_map rml:template ?subject_template .} OPTIONAL {?_subject_map rml:reference ?subject_reference .} OPTIONAL {?_subject_map rml:constant ?subject_constant} + OPTIONAL {?_subject_map rml:quotedTriplesMap ?subject_quoted . + OPTIONAL { + ?_subject_map rml:joinCondition ?join_condition . + ?join_condition rml:child ?subject_child_value; + rml:parent ?subject_parent_value. + } + } OPTIONAL { ?_subject_map rml:class ?rdf_class . } OPTIONAL { ?_subject_map rml:termType ?termtype . } OPTIONAL { ?_subject_map rml:graph ?graph . } - OPTIONAL { ?_subject_map rml:graphMap ?_graph_structure . - ?_graph_structure rml:constant ?graph . } - OPTIONAL { ?_subject_map rml:graphMap ?_graph_structure . - ?_graph_structure rml:template ?graph . } + OPTIONAL { ?_subject_map rml:graphMap ?subject_graph_structure . + ?subject_graph_structure rml:constant ?graph . + OPTIONAL {?subject_graph_structure rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?subject_graph_dump.} + } + OPTIONAL { ?_subject_map rml:graphMap ?subj_graph_structure . + ?subj_graph_structure rml:template ?graph . + OPTIONAL {?subj_graph_structure rml:logicalTarget ?subj_output . + ?subj_output rml:target ?subj_dump. + ?subj_dump void:dataDump ?subject_graph_dump.} + } + OPTIONAL {?_subject_map rml:functionExecution ?subject_function . + OPTIONAL { + ?_subject_map rml:returnMap ?output_map . + ?output_map rml:constant ?subject_output . + } + } + OPTIONAL {?_subject_map rml:logicalTarget ?output. + ?output rml:target ?dump. + ?dump void:dataDump ?subject_dump. + } # Predicate ----------------------------------------------------------------------- OPTIONAL { ?triples_map_id rml:predicateObjectMap ?_predicate_object_map . OPTIONAL { - ?triples_map_id rml:predicateObjectMap ?_predicate_object_map . ?_predicate_object_map rml:predicateMap ?_predicate_map . 
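The source block now matches several remote descriptions (`dcat:downloadURL`, `void:dataDump`, `dcat:url` plus a CSV dialect, `td:hasForm`/`hctl:hasTarget`, `sd:endpoint`), and the new `requests`/`gzip`/`zipfile`/`io` imports at the top of the module point at dumps arriving compressed. A sketch of fetching such a source, with the URL and archive layout as assumptions (`fetch_source` is an illustrative helper, not the engine's API):

```python
import gzip
import io
import zipfile
import requests

def fetch_source(url):
    resp = requests.get(url)
    resp.raise_for_status()
    if url.endswith(".gz"):
        return gzip.decompress(resp.content).decode("utf-8")
    if url.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(resp.content)) as archive:
            first = archive.namelist()[0]          # assume a single member
            return archive.read(first).decode("utf-8")
    return resp.text

# csv_text = fetch_source("https://example.org/dump.csv.gz")  # hypothetical URL
```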
?_predicate_map rml:constant ?predicate_constant . } @@ -1324,13 +1491,42 @@ def mapping_parser(mapping_file): } OPTIONAL { ?_predicate_object_map rml:predicate ?predicate_constant_shortcut . - } - + } + OPTIONAL { + ?_predicate_object_map rml:predicateMap ?_predicate_map . + ?_predicate_map rml:functionExecution ?predicate_function . + OPTIONAL { + ?_predicate_map rml:returnMap ?output_map . + ?output_map rml:constant ?predicate_output . + } + } + OPTIONAL { + ?_predicate_map rml:logicalTarget ?pre_output . + ?pre_output rml:target ?pre_dump. + ?pre_dump void:dataDump ?predicate_dump. + } # Object -------------------------------------------------------------------------- + OPTIONAL { ?_predicate_object_map rml:objectMap ?_object_map . ?_object_map rml:constant ?object_constant . + OPTIONAL { ?_object_map rml:language ?language .} + OPTIONAL {?_object_map rml:languageMap ?language_map. + OPTIONAL {?language_map rml:reference ?language_value.} + OPTIONAL {?language_map rml:constant ?language.} + OPTIONAL {?language_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?language_dump.} + } + OPTIONAL {?_object_map rml:datatypeMap ?datatype_map. + OPTIONAL {?datatype_map rml:template ?datatype_value.} + OPTIONAL {?datatype_map rml:constant ?datatype.} + OPTIONAL {?datatype_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?datatype_dump.} + } + OPTIONAL {?_object_map rml:termType ?term .} OPTIONAL { ?_object_map rml:datatype ?object_datatype . } @@ -1350,7 +1546,19 @@ def mapping_parser(mapping_file): ?_object_map rml:reference ?object_reference . OPTIONAL { ?_object_map rml:language ?language .} OPTIONAL {?_object_map rml:languageMap ?language_map. - ?language_map rml:reference ?language_value.} + OPTIONAL {?language_map rml:reference ?language_value.} + OPTIONAL {?language_map rml:constant ?language.} + OPTIONAL {?language_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?language_dump.} + } + OPTIONAL {?_object_map rml:datatypeMap ?datatype_map. + OPTIONAL {?datatype_map rml:template ?datatype_value.} + OPTIONAL {?datatype_map rml:constant ?object_datatype.} + OPTIONAL {?datatype_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?datatype_dump.} + } OPTIONAL {?_object_map rml:termType ?term .} OPTIONAL { ?_object_map rml:datatype ?object_datatype . @@ -1363,17 +1571,63 @@ def mapping_parser(mapping_file): ?_object_map rml:joinCondition ?join_condition . ?join_condition rml:child ?child_value; rml:parent ?parent_value. + OPTIONAL{?parent_value rml:functionExecution ?executed_parent . + ?executed_parent rml:function ?parent_function .} + OPTIONAL{?child_value rml:functionExecution ?executed_child . + ?executed_child rml:function ?child_function .} OPTIONAL {?_object_map rml:termType ?term .} } } + OPTIONAL { + ?_predicate_object_map rml:objectMap ?_object_map . + ?_object_map rml:quotedTriplesMap ?object_quoted . + OPTIONAL { + ?_object_map rml:joinCondition ?join_condition . + ?join_condition rml:child ?child_value; + rml:parent ?parent_value. + } + } OPTIONAL { ?_predicate_object_map rml:object ?object_constant_shortcut . } + OPTIONAL{ + OPTIONAL { + ?_object_map rml:datatype ?object_datatype . + } + ?_object_map rml:functionExecution ?function. + OPTIONAL { + ?_object_map rml:returnMap ?output_map . + ?output_map rml:constant ?func_output . + } + OPTIONAL { ?_object_map rml:language ?language .} + OPTIONAL {?_object_map rml:languageMap ?language_map. 
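`rml:languageMap` and `rml:datatypeMap` let the language tag or datatype come from a constant, a reference, or a template, which changes how the literal is serialized. A small sketch of the resulting N-Triples forms (`make_literal` is an illustrative helper):

```python
# A language map yields "..."@tag, a datatype map yields "..."^^<iri>,
# and a bare value stays a plain literal.
def make_literal(value, lang=None, datatype=None):
    if lang:
        return f'"{value}"@{lang}'
    if datatype:
        return f'"{value}"^^<{datatype}>'
    return f'"{value}"'

row = {"label": "casa", "tag": "es"}
print(make_literal(row["label"], lang=row["tag"]))
print(make_literal("42", datatype="http://www.w3.org/2001/XMLSchema#integer"))
```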
+ OPTIONAL {?language_map rml:reference ?language_value.} + OPTIONAL {?language_map rml:constant ?language_value.} + OPTIONAL {?language_map rml:logicalTarget ?language_output . + ?language_output rml:target ?language_dump. + ?language_dump void:dataDump ?language_dump.} + } + OPTIONAL {?_object_map rml:datatypeMap ?datatype_map. + OPTIONAL {?datatype_map rml:template ?datatype_value.} + OPTIONAL {?datatype_map rml:constant ?datatype_value.} + OPTIONAL {?datatype_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?datatype_dump.} + } + OPTIONAL {?_object_map rml:termType ?term .} + + } OPTIONAL {?_predicate_object_map rml:graph ?predicate_object_graph .} OPTIONAL { ?_predicate_object_map rml:graphMap ?_graph_structure . - ?_graph_structure rml:constant ?predicate_object_graph . } - OPTIONAL { ?_predicate_object_map rml:graphMap ?_graph_structure . - ?_graph_structure rml:template ?predicate_object_graph . } + OPTIONAL {?_graph_structure rml:template ?predicate_object_graph .} + OPTIONAL {?_graph_structure rml:constant ?predicate_object_graph .} + OPTIONAL {?_graph_structure rml:logicalTarget ?po_graph_output . + ?po_graph_output rml:target ?po_graph_dump. + ?po_graph_dump void:dataDump ?object_graph_dump.} + } + OPTIONAL { ?_object_map rml:logicalTarget ?obj_output. + ?obj_output rml:target ?obj_dump. + ?obj_dump void:dataDump ?object_dump.} } OPTIONAL { ?_source a d2rq:Database; @@ -1391,11 +1645,13 @@ def mapping_parser(mapping_file): prefix d2rq: prefix td: prefix htv: - prefix hctl: + prefix hctl: + prefix fnml: SELECT DISTINCT * WHERE { # Subject ------------------------------------------------------------------------- + OPTIONAL{?triples_map_id a ?mappings_type} ?triples_map_id rml:logicalSource ?_source . OPTIONAL{?_source rml:source ?data_source .} OPTIONAL{ @@ -1408,50 +1664,59 @@ def mapping_parser(mapping_file): OPTIONAL { ?_source rr:tableName ?tablename .} OPTIONAL { ?_source rml:query ?query .} - ?triples_map_id rr:subjectMap ?_subject_map . + OPTIONAL {?triples_map_id rr:subjectMap ?_subject_map .} + OPTIONAL {?triples_map_id rml:subjectMap ?_subject_map .} OPTIONAL {?_subject_map rr:template ?subject_template .} OPTIONAL {?_subject_map rml:reference ?subject_reference .} OPTIONAL {?_subject_map rr:constant ?subject_constant} + OPTIONAL {?_subject_map rml:quotedTriplesMap ?subject_quoted . + OPTIONAL { + ?_subject_map rr:joinCondition ?join_condition . + ?join_condition rr:child ?subject_child_value; + rr:parent ?subject_parent_value. + } + } OPTIONAL { ?_subject_map rr:class ?rdf_class . } OPTIONAL { ?_subject_map rr:termType ?termtype . } OPTIONAL { ?_subject_map rr:graph ?graph . } OPTIONAL { ?_subject_map rr:graphMap ?_graph_structure . ?_graph_structure rr:constant ?graph . } OPTIONAL { ?_subject_map rr:graphMap ?_graph_structure . - ?_graph_structure rr:template ?graph . } + ?_graph_structure rr:template ?graph . } + OPTIONAL {?_subject_map fnml:functionValue ?subject_function .} # Predicate ----------------------------------------------------------------------- OPTIONAL { ?triples_map_id rr:predicateObjectMap ?_predicate_object_map . - + OPTIONAL {?_predicate_object_map rr:predicateMap ?_predicate_map .} + OPTIONAL {?_predicate_object_map rml:predicateMap ?_predicate_map .} OPTIONAL { - ?triples_map_id rr:predicateObjectMap ?_predicate_object_map . - ?_predicate_object_map rr:predicateMap ?_predicate_map . ?_predicate_map rr:constant ?predicate_constant . } OPTIONAL { - ?_predicate_object_map rr:predicateMap ?_predicate_map . 
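`rml:quotedTriplesMap` (RML-star) allows a subject or object position to hold an embedded triple, serialized in N-Triples-star with `<< ... >>`. A sketch of that output shape (`quote_triple` is an illustrative helper):

```python
def quote_triple(s, p, o):
    # Embed a triple as an RDF-star quoted term.
    return f"<< {s} {p} {o} >>"

inner = quote_triple("<http://ex.org/bob>",
                     "<http://ex.org/age>", '"23"')
outer = f'{inner} <http://ex.org/certainty> "0.9" .'
print(outer)   # << <http://ex.org/bob> <http://ex.org/age> "23" >> ... .
```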
?_predicate_map rr:template ?predicate_template . } OPTIONAL { - ?_predicate_object_map rr:predicateMap ?_predicate_map . ?_predicate_map rml:reference ?predicate_reference . } OPTIONAL { ?_predicate_object_map rr:predicate ?predicate_constant_shortcut . - } + } + OPTIONAL { + ?_predicate_map fnml:functionValue ?predicate_function . + } # Object -------------------------------------------------------------------------- + OPTIONAL {?_predicate_object_map rr:objectMap ?_object_map .} + OPTIONAL {?_predicate_object_map rml:objectMap ?_object_map .} OPTIONAL { - ?_predicate_object_map rr:objectMap ?_object_map . ?_object_map rr:constant ?object_constant . OPTIONAL { ?_object_map rr:datatype ?object_datatype . } } OPTIONAL { - ?_predicate_object_map rr:objectMap ?_object_map . ?_object_map rr:template ?object_template . OPTIONAL {?_object_map rr:termType ?term .} OPTIONAL {?_object_map rml:languageMap ?language_map. @@ -1461,29 +1726,47 @@ def mapping_parser(mapping_file): } } OPTIONAL { - ?_predicate_object_map rr:objectMap ?_object_map . ?_object_map rml:reference ?object_reference . OPTIONAL { ?_object_map rr:language ?language .} OPTIONAL {?_object_map rml:languageMap ?language_map. ?language_map rml:reference ?language_value.} + OPTIONAL {?_object_map rml:datatypeMap ?datatype_map. + ?datatype_map rml:template ?datatype_value.} OPTIONAL {?_object_map rr:termType ?term .} OPTIONAL { ?_object_map rr:datatype ?object_datatype . } } OPTIONAL { - ?_predicate_object_map rr:objectMap ?_object_map . ?_object_map rr:parentTriplesMap ?object_parent_triples_map . OPTIONAL { ?_object_map rr:joinCondition ?join_condition . ?join_condition rr:child ?child_value; rr:parent ?parent_value. + OPTIONAL{?parent_value fnml:functionValue ?parent_function.} + OPTIONAL{?child_value fnml:functionValue ?child_function.} OPTIONAL {?_object_map rr:termType ?term .} } + OPTIONAL { + ?_object_map rr:joinCondition ?join_condition . + ?join_condition rr:child ?child_value; + rr:parent ?parent_value; + } } OPTIONAL { ?_predicate_object_map rr:object ?object_constant_shortcut . } + OPTIONAL { + ?_predicate_object_map rml:object ?object_constant_shortcut . + } + OPTIONAL { + ?_object_map rml:quotedTriplesMap ?object_quoted . + OPTIONAL { + ?_object_map rr:joinCondition ?join_condition . + ?join_condition rr:child ?child_value; + rr:parent ?parent_value. + } + } OPTIONAL {?_predicate_object_map rr:graph ?predicate_object_graph .} OPTIONAL { ?_predicate_object_map rr:graphMap ?_graph_structure . ?_graph_structure rr:constant ?predicate_object_graph . 
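Both syntaxes can now attach a function to a term map (`rml:functionExecution` in the new formulation, `fnml:functionValue` in the legacy one), with implementations supplied by the new `fnml_functions.py`/`mapping_functions.py` modules. An illustrative sketch of the dispatch idea; the IRI and registry shape are assumptions, not those modules' actual contents:

```python
# Map function IRIs to Python callables, applied to already-resolved arguments.
FUNCTIONS = {
    "http://example.com/idlab/function/toUpperCase":
        lambda args: args["input"].upper(),
}

def execute(function_iri, args):
    try:
        return FUNCTIONS[function_iri](args)
    except KeyError:
        raise ValueError(f"unregistered function: {function_iri}")

print(execute("http://example.com/idlab/function/toUpperCase",
              {"input": "alice"}))   # -> ALICE
```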
} @@ -1499,49 +1782,143 @@ def mapping_parser(mapping_file): } } """ - mapping_query_results = mapping_graph.query(mapping_query) triples_map_list = [] + func_map_list = [] + if new_formulation == "yes": + mapping_query_results = mapping_graph.query(function_query) + for result_triples_map in mapping_query_results: + if result_triples_map.function_id != None: + func_map_exists = False + for func_map in func_map_list: + func_map_exists = func_map_exists or ( + str(func_map.func_map_id) == str(result_triples_map.function_id)) + if not func_map_exists: + parameters = {} + if result_triples_map.param != None: + if str(result_triples_map.param) not in parameters: + if result_triples_map.input_value != None: + parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.input_value), + "type":"constant"} + elif result_triples_map.param_reference != None: + parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_reference), + "type":"reference"} + elif result_triples_map.param_template != None: + parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_template), + "type":"template"} + elif result_triples_map.param_func != None: + parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_func), + "type":"function"} + func_map = tm.FunctionMap(str(result_triples_map.function_id),str(result_triples_map.function),parameters) + func_map_list.append(func_map) + else: + for func_map in func_map_list: + if str(func_map.func_map_id) == str(result_triples_map.function_id): + if result_triples_map.param != None: + if str(result_triples_map.param) not in func_map.parameters: + if result_triples_map.input_value != None: + func_map.parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.input_value), + "type":"constant"} + elif result_triples_map.param_reference != None: + func_map.parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_reference), + "type":"reference"} + elif result_triples_map.param_template != None: + func_map.parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_template), + "type":"template"} + elif result_triples_map.param_func != None: + func_map.parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_func), + "type":"function"} + mapping_query_results = mapping_graph.query(mapping_query) for result_triples_map in mapping_query_results: triples_map_exists = False for triples_map in triples_map_list: triples_map_exists = triples_map_exists or ( str(triples_map.triples_map_id) == str(result_triples_map.triples_map_id)) - if not triples_map_exists: if result_triples_map.subject_template != None: if result_triples_map.rdf_class is None: reference, condition = string_separetion(str(result_triples_map.subject_template)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_template), condition, "template", + subject_map = tm.SubjectMap(str(result_triples_map.subject_template), condition, "template","None","None", [result_triples_map.rdf_class], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") else: reference, condition = string_separetion(str(result_triples_map.subject_template)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_template), condition, "template", + subject_map = tm.SubjectMap(str(result_triples_map.subject_template), condition, "template","None","None", [str(result_triples_map.rdf_class)], 
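Each `FunctionMap` parameter is stored as `{"value": ..., "type": constant|reference|template|function}`. A compact sketch of resolving such an entry against one data row (`resolve_param` is illustrative; `function`-typed parameters would recurse into another function execution):

```python
import re

def resolve_param(spec, row):
    kind, value = spec["type"], spec["value"]
    if kind == "constant":
        return value
    if kind == "reference":
        return row[value]
    if kind == "template":
        # replace each {column} placeholder with the row's value
        return re.sub(r"\{(.+?)\}", lambda m: str(row[m.group(1)]), value)
    raise NotImplementedError(kind)

row = {"first": "ada", "last": "lovelace"}
print(resolve_param({"type": "template", "value": "{first}-{last}"}, row))
```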
result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") elif result_triples_map.subject_reference != None: if result_triples_map.rdf_class is None: reference, condition = string_separetion(str(result_triples_map.subject_reference)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_reference), condition, "reference", + subject_map = tm.SubjectMap(str(result_triples_map.subject_reference), condition, "reference","None","None", [result_triples_map.rdf_class], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") else: reference, condition = string_separetion(str(result_triples_map.subject_reference)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_reference), condition, "reference", + subject_map = tm.SubjectMap(str(result_triples_map.subject_reference), condition, "reference","None","None", [str(result_triples_map.rdf_class)], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") elif result_triples_map.subject_constant != None: if result_triples_map.rdf_class is None: reference, condition = string_separetion(str(result_triples_map.subject_constant)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_constant), condition, "constant", + subject_map = tm.SubjectMap(str(result_triples_map.subject_constant), condition, "constant","None","None", [result_triples_map.rdf_class], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") else: reference, condition = string_separetion(str(result_triples_map.subject_constant)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_constant), condition, "constant", + subject_map = tm.SubjectMap(str(result_triples_map.subject_constant), condition, "constant","None","None", [str(result_triples_map.rdf_class)], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") + elif result_triples_map.subject_function != None: + func_output = "None" + if result_triples_map.subject_output != None: + if "#" in result_triples_map.subject_output: + func_output = result_triples_map.subject_output.split("#")[1] + else: + func_output = result_triples_map.subject_output.split("/")[len(result_triples_map.subject_output.split("/"))-1] + if result_triples_map.rdf_class is None: + reference, condition = string_separetion(str(result_triples_map.subject_constant)) + subject_map = tm.SubjectMap(str(result_triples_map.subject_function), condition, "function","None","None", + [str(result_triples_map.rdf_class)], result_triples_map.termtype, + [result_triples_map.graph],func_output) + else: + reference, condition = string_separetion(str(result_triples_map.subject_constant)) + subject_map = tm.SubjectMap(str(result_triples_map.subject_function), condition, "function","None","None","None","None", + [str(result_triples_map.rdf_class)], result_triples_map.termtype, + [result_triples_map.graph],func_output) + elif result_triples_map.subject_quoted != None: + if result_triples_map.rdf_class is None: + reference, condition = string_separetion(str(result_triples_map.subject_quoted)) + subject_map = tm.SubjectMap(str(result_triples_map.subject_quoted), condition, "quoted triples map", + result_triples_map.subject_parent_value, result_triples_map.subject_child_value, + [result_triples_map.rdf_class], result_triples_map.termtype, + [result_triples_map.graph],"None") + else: + reference, condition = string_separetion(str(result_triples_map.subject_quoted)) + 
subject_map = tm.SubjectMap(str(result_triples_map.subject_quoted), condition, "quoted triples map", + result_triples_map.subject_parent_value, result_triples_map.subject_child_value, + [str(result_triples_map.rdf_class)], result_triples_map.termtype, + [result_triples_map.graph],"None") + + if new_formulation == "yes": + output_file = "" + if result_triples_map.subject_dump != None: + output_file = result_triples_map.subject_dump[7:] if result_triples_map.subject_dump[:7] == "file://" else result_triples_map.subject_dump + elif result_triples_map.subject_graph_dump != None: + output_file = result_triples_map.subject_graph_dump[7:] if result_triples_map.subject_graph_dump[:7] == "file://" else result_triples_map.subject_graph_dump + if output_file != "": + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:"subject"} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = "subject" mapping_query_prepared = prepareQuery(mapping_query) @@ -1551,65 +1928,213 @@ def mapping_parser(mapping_file): join_predicate = {} predicate_object_maps_list = [] predicate_object_graph = {} + + function = False for result_predicate_object_map in mapping_query_prepared_results: join = True if result_predicate_object_map.predicate_constant != None: - predicate_map = tm.PredicateMap("constant", str(result_predicate_object_map.predicate_constant), "") + predicate_map = tm.PredicateMap("constant", str(result_predicate_object_map.predicate_constant), "", "None") predicate_object_graph[ str(result_predicate_object_map.predicate_constant)] = result_triples_map.predicate_object_graph elif result_predicate_object_map.predicate_constant_shortcut != None: predicate_map = tm.PredicateMap("constant shortcut", - str(result_predicate_object_map.predicate_constant_shortcut), "") + str(result_predicate_object_map.predicate_constant_shortcut), "", "None") predicate_object_graph[ str(result_predicate_object_map.predicate_constant_shortcut)] = result_triples_map.predicate_object_graph elif result_predicate_object_map.predicate_template != None: template, condition = string_separetion(str(result_predicate_object_map.predicate_template)) - predicate_map = tm.PredicateMap("template", template, condition) + predicate_map = tm.PredicateMap("template", template, condition, "None") elif result_predicate_object_map.predicate_reference != None: reference, condition = string_separetion(str(result_predicate_object_map.predicate_reference)) - predicate_map = tm.PredicateMap("reference", reference, condition) + predicate_map = tm.PredicateMap("reference", reference, condition, "None") + elif result_predicate_object_map.predicate_function != None: + func_output = "None" + if result_predicate_object_map.predicate_output != None: + if "#" in result_predicate_object_map.predicate_output: + func_output = result_predicate_object_map.predicate_output.split("#")[1] + else: + func_output = result_predicate_object_map.predicate_output.split("/")[len(result_predicate_object_map.predicate_output.split("/"))-1] + predicate_map = tm.PredicateMap("function", str(result_predicate_object_map.predicate_function),"",func_output) else: - predicate_map = tm.PredicateMap("None", "None", "None") + predicate_map = tm.PredicateMap("None", "None", "None", "None") + + if new_formulation == "yes": + if result_predicate_object_map.predicate_dump != None: + output_file = 
result_predicate_object_map.predicate_dump[7:] if result_predicate_object_map.predicate_dump[:7] == "file://" else result_predicate_object_map.predicate_dump + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:[predicate_map.value]} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [predicate_map.value] + else: + if predicate_map.value not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(predicate_map.value) + + if "execute" in predicate_map.value: + function = True if result_predicate_object_map.object_constant != None: object_map = tm.ObjectMap("constant", str(result_predicate_object_map.object_constant), str(result_predicate_object_map.object_datatype), "None", "None", result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") elif result_predicate_object_map.object_template != None: object_map = tm.ObjectMap("template", str(result_predicate_object_map.object_template), str(result_predicate_object_map.object_datatype), "None", "None", result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") elif result_predicate_object_map.object_reference != None: object_map = tm.ObjectMap("reference", str(result_predicate_object_map.object_reference), str(result_predicate_object_map.object_datatype), "None", "None", result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") elif result_predicate_object_map.object_parent_triples_map != None: - if predicate_map.value + " " + str( - result_predicate_object_map.object_parent_triples_map) not in join_predicate: - join_predicate[ - predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { - "predicate": predicate_map, "childs": [str(result_predicate_object_map.child_value)], - "parents": [str(result_predicate_object_map.parent_value)], - "triples_map": str(result_predicate_object_map.object_parent_triples_map)} + if predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map) not in join_predicate: + if (result_predicate_object_map.child_function is None) and (result_predicate_object_map.parent_function is not None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { + "predicate":predicate_map, + "childs":[str(result_predicate_object_map.child_value)], + "parents":[str(result_predicate_object_map.parent_function)], + "triples_map":str(result_predicate_object_map.object_parent_triples_map)} + elif (result_predicate_object_map.child_function is not None) and (result_predicate_object_map.parent_function is None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { + "predicate":predicate_map, + "childs":[str(result_predicate_object_map.child_function)], + "parents":[str(result_predicate_object_map.parent_value)], + 
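Logical targets (`rml:logicalTarget` → `void:dataDump`) are collected into the global `logical_dump` dict, mapping a triples-map id to its output files and the terms each file captures, with `file://` prefixes stripped first. A sketch of that bookkeeping, simplified to always store lists (`register_dump` is an illustrative name for the inline code above):

```python
logical_dump = {}

def register_dump(tm_id, dump_iri, term):
    # Normalize file:// IRIs to paths and record the term per output file.
    path = dump_iri[7:] if dump_iri.startswith("file://") else dump_iri
    targets = logical_dump.setdefault(tm_id, {})
    targets.setdefault(path, [])
    if term not in targets[path]:
        targets[path].append(term)

register_dump("TM1", "file:///tmp/subjects.nt", "subject")
register_dump("TM1", "file:///tmp/subjects.nt", "subject")  # deduplicated
print(logical_dump)   # {'TM1': {'/tmp/subjects.nt': ['subject']}}
```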
"triples_map":str(result_predicate_object_map.object_parent_triples_map)} + elif (result_predicate_object_map.child_function is not None) and (result_predicate_object_map.parent_function is not None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { + "predicate":predicate_map, + "childs":[str(result_predicate_object_map.child_function)], + "parents":[str(result_predicate_object_map.parent_function)], + "triples_map":str(result_predicate_object_map.object_parent_triples_map)} + else: + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { + "predicate":predicate_map, + "childs":[str(result_predicate_object_map.child_value)], + "parents":[str(result_predicate_object_map.parent_value)], + "triples_map":str(result_predicate_object_map.object_parent_triples_map)} else: - join_predicate[ - predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ - "childs"].append(str(result_predicate_object_map.child_value)) - join_predicate[ - predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ - "parents"].append(str(result_predicate_object_map.parent_value)) + if (result_predicate_object_map.child_function is None) and (result_predicate_object_map.parent_function is not None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "childs"].append(str(result_predicate_object_map.child_function)) + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "parents"].append(str(result_predicate_object_map.parent_value)) + elif (result_predicate_object_map.child_function is not None) and (result_predicate_object_map.parent_function is None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "childs"].append(str(result_predicate_object_map.child_function)) + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "parents"].append(str(result_predicate_object_map.parent_value)) + elif (result_predicate_object_map.child_function is not None) and (result_predicate_object_map.parent_function is not None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "childs"].append(str(result_predicate_object_map.child_function)) + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "parents"].append(str(result_predicate_object_map.parent_function)) + else: + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "childs"].append(str(result_predicate_object_map.child_value)) + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "parents"].append(str(result_predicate_object_map.parent_value)) join = False + elif result_predicate_object_map.function is not None: + func_output = "None" + if result_predicate_object_map.func_output != None: + if "#" in result_predicate_object_map.func_output: + func_output = result_predicate_object_map.func_output.split("#")[1] + else: + func_output = result_predicate_object_map.func_output.split("/")[len(result_predicate_object_map.func_output.split("/"))-1] + object_map = tm.ObjectMap("reference function", str(result_predicate_object_map.function), + 
str(result_predicate_object_map.object_datatype), "None", "None", + result_predicate_object_map.term, result_predicate_object_map.language, + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, func_output) + elif result_predicate_object_map.object_quoted != None: + object_map = tm.ObjectMap("quoted triples map", str(result_predicate_object_map.object_quoted), + str(result_predicate_object_map.object_datatype), + [str(result_predicate_object_map.child_value)], [str(result_predicate_object_map.parent_value)], + result_predicate_object_map.term, result_predicate_object_map.language, + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") elif result_predicate_object_map.object_constant_shortcut != None: object_map = tm.ObjectMap("constant shortcut", str(result_predicate_object_map.object_constant_shortcut), "None", "None", "None", result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") else: - object_map = tm.ObjectMap("None", "None", "None", "None", "None", "None", "None", "None") + object_map = tm.ObjectMap("None", "None", "None", "None", "None", "None", "None", "None", "None", "None") + + if new_formulation == "yes": + output_file = "" + if result_predicate_object_map.object_dump != None: + output_file = result_predicate_object_map.object_dump[7:] if result_predicate_object_map.object_dump[:7] == "file://" else result_predicate_object_map.object_dump + elif result_predicate_object_map.language_dump != None: + output_file = result_predicate_object_map.language_dump[7:] if result_predicate_object_map.language_dump[:7] == "file://" else result_predicate_object_map.language_dump + elif result_predicate_object_map.datatype_dump != None: + output_file = result_predicate_object_map.datatype_dump[7:] if result_predicate_object_map.datatype_dump[:7] == "file://" else result_predicate_object_map.datatype_dump + if output_file != "": + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:[object_map.value]} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + if result_predicate_object_map.language_dump != None: + if result_predicate_object_map.language != None: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [object_map.value + "_" + result_predicate_object_map.language] + elif result_predicate_object_map.language_value != None: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [object_map.value + "_" + result_predicate_object_map.language_value] + elif result_predicate_object_map.datatype_dump != None: + if result_predicate_object_map.object_datatype != None: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [str(object_map.value + "_" + result_predicate_object_map.object_datatype)] + elif result_predicate_object_map.datatype_value != None: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [str(object_map.value + "_" + result_predicate_object_map.datatype_value)] + else: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [object_map.value] + else: + if result_predicate_object_map.language_dump != None: + if result_predicate_object_map.language != None: + if result_predicate_object_map.language_value not in 
logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(object_map.value + "_" + result_predicate_object_map.language) + elif result_predicate_object_map.language_value != None: + if result_predicate_object_map.language_value not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(object_map.value + "_" + result_predicate_object_map.language_value) + elif result_predicate_object_map.datatype_dump != None: + if result_predicate_object_map.object_datatype != None: + if str(object_map.value + "_" + result_predicate_object_map.object_datatype) not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(str(object_map.value + "_" + result_predicate_object_map.object_datatype)) + elif result_predicate_object_map.datatype_value != None: + if str(object_map.value + "_" + result_predicate_object_map.datatype_value) not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(str(object_map.value + "_" + result_predicate_object_map.datatype_value)) + else: + if object_map.value not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(object_map.value) + if result_predicate_object_map.object_graph_dump != None: + output_file = result_predicate_object_map.object_graph_dump[7:] if result_predicate_object_map.object_graph_dump[:7] == "file://" else result_predicate_object_map.object_graph_dump + if output_file != "": + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:[object_map.value]} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [object_map.value] + else: + if object_map.value not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(object_map.value) + if join: predicate_object_maps_list += [ tm.PredicateObjectMap(predicate_map, object_map, predicate_object_graph)] @@ -1620,18 +2145,34 @@ def mapping_parser(mapping_file): str(result_predicate_object_map.object_datatype), join_predicate[jp]["childs"], join_predicate[jp]["parents"], result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") predicate_object_maps_list += [ tm.PredicateObjectMap(join_predicate[jp]["predicate"], object_map, predicate_object_graph)] - if result_triples_map.url_source is not None: - current_triples_map = tm.TriplesMap(str(result_triples_map.triples_map_id), - str(result_triples_map.url_source), subject_map, - predicate_object_maps_list, - ref_form=str(result_triples_map.ref_form), - iterator=str(result_triples_map.iterator), - tablename=str(result_triples_map.tablename), - query=str(result_triples_map.query)) + if result_triples_map.delimiter is not None: + url_source = str(result_triples_map.url_source)[7:] if str(result_triples_map.url_source)[:7] == "file://" else str(result_triples_map.url_source) + delimiter[url_source] = 
str(result_triples_map.delimiter) + if ".xml" in str(result_triples_map.url_source) and str(result_triples_map.ref_form) != "http://w3id.org/rml/XPath": + current_triples_map = tm.TriplesMap(str(result_triples_map.triples_map_id), + str(result_triples_map.url_source), subject_map, + predicate_object_maps_list, + ref_form="http://w3id.org/rml/XPath", + iterator=str(result_triples_map.iterator), + tablename=str(result_triples_map.tablename), + query=str(result_triples_map.query), + function=function,func_map_list=func_map_list, + mappings_type=str(result_triples_map.mappings_type)) + else: + current_triples_map = tm.TriplesMap(str(result_triples_map.triples_map_id), + str(result_triples_map.url_source), subject_map, + predicate_object_maps_list, + ref_form=str(result_triples_map.ref_form), + iterator=str(result_triples_map.iterator), + tablename=str(result_triples_map.tablename), + query=str(result_triples_map.query), + function=function,func_map_list=func_map_list, + mappings_type=str(result_triples_map.mappings_type)) else: current_triples_map = tm.TriplesMap(str(result_triples_map.triples_map_id), str(result_triples_map.data_source), subject_map, @@ -1639,7 +2180,10 @@ def mapping_parser(mapping_file): ref_form=str(result_triples_map.ref_form), iterator=str(result_triples_map.iterator), tablename=str(result_triples_map.tablename), - query=str(result_triples_map.query)) + query=str(result_triples_map.query), + function=function,func_map_list=func_map_list, + mappings_type=str(result_triples_map.mappings_type)) + triples_map_list += [current_triples_map] else: @@ -1650,6 +2194,66 @@ def mapping_parser(mapping_file): if result_triples_map.graph not in triples_map.subject_map.graph: triples_map.graph.append(result_triples_map.graph) + if new_formulation == "yes": + output_file = "" + if result_triples_map.subject_dump != None: + output_file = result_triples_map.subject_dump[7:] if result_triples_map.subject_dump[:7] == "file://" else result_triples_map.subject_dump + elif result_triples_map.subject_graph_dump != None: + output_file = result_triples_map.subject_graph_dump[7:] if result_triples_map.subject_graph_dump[:7] == "file://" else result_triples_map.subject_graph_dump + if output_file != "": + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:"subject"} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = "subject" + + if result_triples_map.predicate_constant_shortcut != None: + for po in triples_map.predicate_object_maps_list: + if po.predicate_map.value == str(result_triples_map.predicate_constant_shortcut): + if str(result_triples_map.predicate_constant_shortcut) in po.graph: + po.graph[str(result_triples_map.predicate_constant_shortcut)] = result_triples_map.predicate_object_graph + + if new_formulation == "yes": + output_file = "" + if result_triples_map.predicate_dump != None: + if result_triples_map.predicate_constant != None: + value = result_triples_map.predicate_constant + elif result_triples_map.predicate_template != None: + value = result_triples_map.predicate_template + elif result_triples_map.predicate_reference != None: + value = result_triples_map.predicate_reference + output_file = result_triples_map.predicate_dump[7:] if result_triples_map.predicate_dump[:7] == "file://" else result_triples_map.predicate_dump + + if str(result_triples_map.triples_map_id) not in logical_dump: + 
logical_dump[str(result_triples_map.triples_map_id)] = {output_file:value} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = value + + output_file = "" + if result_triples_map.object_dump != None: + output_file = result_triples_map.object_dump[7:] if result_triples_map.object_dump[:7] == "file://" else result_triples_map.object_dump + elif result_triples_map.object_graph_dump != None: + output_file = result_triples_map.object_graph_dump[7:] if result_triples_map.object_graph_dump[:7] == "file://" else result_triples_map.object_graph_dump + elif result_triples_map.language_dump != None: + output_file = result_triples_map.language_dump[7:] if result_triples_map.language_dump[:7] == "file://" else result_triples_map.language_dump + elif result_triples_map.datatype_dump != None: + output_file = result_triples_map.datatype_dump[7:] if result_triples_map.datatype_dump[:7] == "file://" else result_triples_map.datatype_dump + if output_file != "": + if result_triples_map.object_constant != None: + value = result_triples_map.object_constant + elif result_triples_map.object_reference != None: + value = result_triples_map.object_reference + elif result_triples_map.object_template != None: + value = result_triples_map.object_template + elif result_triples_map.object_parent_triples_map != None: + value = result_triples_map.object_parent_triples_map + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:value} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = value + return mappings_expansion(triples_map_list) @@ -1662,75 +2266,116 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): object_list = [] global blank_message global host, port, user, password, datab - with open(str(triples_map.data_source), "r") as input_file_descriptor: - tree = ET.parse(input_file_descriptor) + if "http" in triples_map.data_source: + response = requests.get(triples_map.data_source, stream=True) + root = ET.fromstring(response.content) + else: + tree = ET.parse(triples_map.data_source) root = tree.getroot() - if "[" not in triples_map.iterator: - level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - 1] - else: - temp = triples_map.iterator.split("[")[0] - level = temp.split("/")[len(temp.split("/")) - 1] - parent_map = {c: p for p in tree.iter() for c in p} - namespace = dict([node for _, node in ET.iterparse(str(triples_map.data_source), events=['start-ns'])]) - if namespace: - for name in namespace: - ET.register_namespace(name, namespace[name]) - if "/" in triples_map.iterator: - parent_level = 2 - while len(list(root.iterfind(level, namespace))) == 0: - if triples_map.iterator != level: - level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - parent_level] + "/" + level - parent_level += 1 - else: + if "[" not in triples_map.iterator: + level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - 1] + if level == "": + i = 1 + while i < len(triples_map.iterator.split("/")) - 1: + level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - i] + if level != "": break - else: - level = "." 
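For context, the `level` resolution reworked in this hunk can be sketched in isolation as follows. This is a minimal illustration of the backward scan over path segments that the added code performs for iterators with a trailing "/"; `resolve_level` is an illustrative name, not part of the patch:

```
def resolve_level(iterator: str) -> str:
    # Drop a predicate filter such as "Employee[@id='7']" to keep only the path part.
    path = iterator.split("[")[0]
    segments = path.split("/")
    # Start from the last segment; for iterators with a trailing "/" it is "".
    level = segments[len(segments) - 1]
    if level == "":
        i = 1
        # Walk backwards until a non-empty segment is found, mirroring the
        # while-loop added to semantify_xml above.
        while i < len(segments) - 1:
            level = segments[len(segments) - i]
            if level != "":
                break
            i += 1
    return level

print(resolve_level("Employees/Employee/"))  # -> "Employee"
```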
- if mapping_partitions == "yes": - if triples_map.predicate_object_maps_list[0].predicate_map.mapping_type == "constant" or \ - triples_map.predicate_object_maps_list[0].predicate_map.mapping_type == "constant shortcut": - predicate = "<" + triples_map.predicate_object_maps_list[0].predicate_map.value + ">" - constant_predicate = False + i += 1 + else: + temp = triples_map.iterator.split("[")[0] + level = temp.split("/")[len(temp.split("/")) - 1] + if level == "": + i = 1 + while i < len(temp.split("/")) - 1: + level = temp.split("/")[len(temp.split("/")) - i] + if level != "": + break + i += 1 + parent_map = {c: p for p in root.iter() for c in p} + if "http" in triples_map.data_source: + namespace = {} + for elem in root.iter(): + namespace_uri = elem.tag.split('}')[0][1:] + if namespace_uri and ':' in elem.tag: + prefix = elem.tag.split(':')[0] + namespace[prefix] = namespace_uri + else: + namespace = dict([node for _, node in ET.iterparse(str(triples_map.data_source), events=['start-ns'])]) + if namespace: + for name in namespace: + ET.register_namespace(name, namespace[name]) + if "/" in triples_map.iterator: + parent_level = 2 + while len(list(root.iterfind(level, namespace))) == 0: + if triples_map.iterator != level: + level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - parent_level] + "/" + level + parent_level += 1 else: - predicate = None - constant_predicate = True + break + else: + level = "." + if mapping_partitions == "yes": + if triples_map.predicate_object_maps_list[0].predicate_map.mapping_type == "constant" or \ + triples_map.predicate_object_maps_list[0].predicate_map.mapping_type == "constant shortcut": + predicate = "<" + triples_map.predicate_object_maps_list[0].predicate_map.value + ">" + constant_predicate = False else: predicate = None constant_predicate = True - for child in root.iterfind(level, namespace): - create_subject = True - global generated_subjects + else: + predicate = None + constant_predicate = True + for child in root.iterfind(level, namespace): + create_subject = True + global generated_subjects - if mapping_partitions == "yes": - if "_" in triples_map.triples_map_id: - componets = triples_map.triples_map_id.split("_")[:-1] - triples_map_id = "" - for name in componets: - triples_map_id += name + "_" - triples_map_id = triples_map_id[:-1] - else: - triples_map_id = triples_map.triples_map_id + if mapping_partitions == "yes": + if "_" in triples_map.triples_map_id: + componets = triples_map.triples_map_id.split("_")[:-1] + triples_map_id = "" + for name in componets: + triples_map_id += name + "_" + triples_map_id = triples_map_id[:-1] + else: + triples_map_id = triples_map.triples_map_id - subject_attr = extract_subject_values(child, generated_subjects[triples_map_id]["subject_attr"], "XML", - parent_map) + subject_attr = extract_subject_values(child, generated_subjects[triples_map_id]["subject_attr"], "XML", + parent_map) - if subject_attr == None: - subject = None - create_subject = False - else: - if triples_map_id in generated_subjects: - if subject_attr in generated_subjects[triples_map_id]: - subject = generated_subjects[triples_map_id][subject_attr] - create_subject = False - - if create_subject: - subject_value = string_substitution_xml(triples_map.subject_map.value, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, namespace) - if triples_map.subject_map.subject_mapping_type == "template": - if triples_map.subject_map.term_type is None: - if triples_map.subject_map.condition == "": + if subject_attr 
== None: + subject = None + create_subject = False + else: + if triples_map_id in generated_subjects: + if subject_attr in generated_subjects[triples_map_id]: + subject = generated_subjects[triples_map_id][subject_attr] + create_subject = False + + if create_subject: + subject_value = string_substitution_xml(triples_map.subject_map.value, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, namespace) + if triples_map.subject_map.subject_mapping_type == "template": + if triples_map.subject_map.term_type is None: + if triples_map.subject_map.condition == "": + + try: + subject = "<" + subject_value + ">" + except: + subject = None + + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + subject = "<" + subject_value + ">" + except: + subject = None + else: + if "IRI" in triples_map.subject_map.term_type: + if triples_map.subject_map.condition == "": try: - subject = "<" + subject_value + ">" + subject = "<" + base + subject_value + ">" except: subject = None @@ -1738,398 +2383,488 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): # field, condition = condition_separetor(triples_map.subject_map.condition) # if row[field] == condition: try: - subject = "<" + subject_value + ">" - except: - subject = None - else: - if "IRI" in triples_map.subject_map.term_type: - if triples_map.subject_map.condition == "": - - try: - subject = "<" + base + subject_value + ">" - except: - subject = None - - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - if "http" not in subject_value: + if "http" not in subject_value: + if base != "": subject = "<" + base + subject_value + ">" else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + else: + if is_valid_url_syntax(subject_value): subject = "<" + subject_value + ">" - except: - subject = None - - elif "BlankNode" in triples_map.subject_map.term_type: - if triples_map.subject_map.condition == "": - - try: - if "/" in subject_value: - subject = "_:" + encode_char(subject_value.replace("/", "2F")).replace("%", "") - if blank_message: - logger.warning( - "Incorrect format for Blank Nodes. \"/\" will be replace with \"2F\".") - blank_message = False else: - subject = "_:" + encode_char(subject_value).replace("%", "") - if "." in subject: - subject = subject.replace(".", "2E") - except: - subject = None + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + except: + subject = None - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - subject = "_:" + subject_value - except: - subject = None + elif "BlankNode" in triples_map.subject_map.term_type: + if triples_map.subject_map.condition == "": - elif "Literal" in triples_map.subject_map.term_type: - subject = None + try: + if "/" in subject_value: + subject = "_:" + encode_char(subject_value.replace("/", "2F")).replace("%", "") + if blank_message: + logger.warning( + "Incorrect format for Blank Nodes. \"/\" will be replaced with \"2F\".") + blank_message = False + else: + subject = "_:" + encode_char(subject_value).replace("%", "") + if "." 
in subject: + subject = subject.replace(".", "2E") + except: + subject = None else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - subject = "_:" + subject_value - except: - subject = None + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + subject = "_:" + subject_value + except: + subject = None - elif "Literal" in triples_map.subject_map.term_type: - subject = None + elif "Literal" in triples_map.subject_map.term_type: + subject = None - else: - if triples_map.subject_map.condition == "": + else: + if triples_map.subject_map.condition == "": - try: - subject = "<" + subject_value + ">" - except: - subject = None + try: + subject = "<" + subject_value + ">" + except: + subject = None - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - subject = "<" + subject_value + ">" - except: - subject = None + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + subject = "<" + subject_value + ">" + except: + subject = None - elif "reference" in triples_map.subject_map.subject_mapping_type: - if triples_map.subject_map.condition == "": - subject_value = string_substitution_xml(triples_map.subject_map.value, ".+", child, "subject", - triples_map.iterator, parent_map, namespace) - subject_value = subject_value[0][1:-1] - try: - if " " not in subject_value: - if "http" not in subject_value: + elif "reference" in triples_map.subject_map.subject_mapping_type: + if triples_map.subject_map.condition == "": + subject_value = string_substitution_xml(triples_map.subject_map.value, ".+", child, "subject", + triples_map.iterator, parent_map, namespace) + subject_value = subject_value[0][1:-1] + try: + if " " not in subject_value: + if "http" not in subject_value: + if base != "": subject = "<" + base + subject_value + ">" else: - subject = "<" + subject_value + ">" + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - logger.error(" is an invalid URL") - subject = None - except: + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + else: + logger.error(subject_value + " is an invalid URL") + subject = None + except: + subject = None + if triples_map.subject_map.term_type == "IRI": + if " " not in subject_value: + subject = "<" + encode_char(subject_value) + ">" + else: subject = None - if triples_map.subject_map.term_type == "IRI": - if " " not in subject_value: - subject = "<" + encode_char(subject_value) + ">" - else: - subject = None - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - if "http" not in subject_value: + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + if "http" not in subject_value: + if base != "": subject = "<" + base + subject_value + ">" else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + else: + if is_valid_url_syntax(subject_value): subject = "<" + subject_value + ">" - except: - subject = None + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + except: + subject = None - elif "constant" in triples_map.subject_map.subject_mapping_type: - subject = 
"<" + triples_map.subject_map.value + ">" - else: - if triples_map.subject_map.condition == "": + else: + if triples_map.subject_map.condition == "": - try: - subject = "\"" + triples_map.subject_map.value + "\"" - except: - subject = None + try: + subject = "\"" + triples_map.subject_map.value + "\"" + except: + subject = None - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - subject = "\"" + triples_map.subject_map.value + "\"" - except: - subject = None + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + subject = "\"" + triples_map.subject_map.value + "\"" + except: + subject = None - if mapping_partitions == "yes": - if triples_map_id in generated_subjects: - if subject_attr in generated_subjects[triples_map_id]: - pass - else: - generated_subjects[triples_map_id][subject_attr] = subject + if mapping_partitions == "yes": + if triples_map_id in generated_subjects: + if subject_attr in generated_subjects[triples_map_id]: + pass else: - generated_subjects[triples_map_id] = {subject_attr: subject} + generated_subjects[triples_map_id][subject_attr] = subject + else: + generated_subjects[triples_map_id] = {subject_attr: subject} - if triples_map.subject_map.rdf_class != [None] and subject != None: - predicate = "" - for rdf_class in triples_map.subject_map.rdf_class: - if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): - obj = "<{}>".format(rdf_class) - dictionary_table_update(subject) - dictionary_table_update(obj) - dictionary_table_update(predicate + "_" + obj) - rdf_type = subject + " " + predicate + " " + obj + ".\n" - for graph in triples_map.subject_map.graph: - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - rdf_type = rdf_type[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, - "subject", - triples_map.iterator, - parent_map, - namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, - namespace) + ">") - else: - rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if dic_table[predicate + "_" + obj] not in g_triples: - output_file_descriptor.write(rdf_type) - g_triples.update({dic_table[predicate + "_" + obj]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + obj]]: - output_file_descriptor.write(rdf_type) - g_triples[dic_table[predicate + "_" + obj]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 + if triples_map.subject_map.rdf_class != [None] and subject != None: + predicate = "" + for rdf_class in triples_map.subject_map.rdf_class: + if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): + obj = "<{}>".format(rdf_class) + dictionary_table_update(subject) + dictionary_table_update(obj) + dictionary_table_update(predicate + "_" + obj) + rdf_type = subject + " " + predicate + " " + obj + ".\n" + for graph in triples_map.subject_map.graph: + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + rdf_type = rdf_type[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, + "subject", + triples_map.iterator, + parent_map, + namespace) + ">.\n" + dictionary_table_update( + "<" + 
string_substitution_xml(graph, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, + namespace) + ">") else: + rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if duplicate == "yes": + if dic_table[predicate + "_" + obj] not in g_triples: output_file_descriptor.write(rdf_type) + g_triples.update({dic_table[predicate + "_" + obj]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + obj]]: + output_file_descriptor.write(rdf_type) + g_triples[dic_table[predicate + "_" + obj]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 - for predicate_object_map in triples_map.predicate_object_maps_list: - if constant_predicate: - if predicate_object_map.predicate_map.mapping_type == "constant" or predicate_object_map.predicate_map.mapping_type == "constant shortcut": - predicate = "<" + predicate_object_map.predicate_map.value + ">" - elif predicate_object_map.predicate_map.mapping_type == "template": - if predicate_object_map.predicate_map.condition != "": - # field, condition = condition_separetor(predicate_object_map.predicate_map.condition) - # if row[field] == condition: - try: - predicate = "<" + string_substitution_xml(predicate_object_map.predicate_map.value, - "{(.+?)}", child, "predicate", - triples_map.iterator, parent_map, - namespace) + ">" - except: - predicate = None - # else: - # predicate = None - else: - try: - predicate = "<" + string_substitution_xml(predicate_object_map.predicate_map.value, - "{(.+?)}", child, "predicate", - triples_map.iterator, parent_map, - namespace) + ">" - except: - predicate = None - elif predicate_object_map.predicate_map.mapping_type == "reference": - if predicate_object_map.predicate_map.condition != "": - # field, condition = condition_separetor(predicate_object_map.predicate_map.condition) - # if row[field] == condition: - predicate = string_substitution_xml(predicate_object_map.predicate_map.value, ".+", child, - "predicate", triples_map.iterator, parent_map, - namespace) - # else: - # predicate = None else: - predicate = string_substitution_xml(predicate_object_map.predicate_map.value, ".+", child, - "predicate", triples_map.iterator, parent_map, - namespace) - predicate = "<" + predicate[1:-1] + ">" + output_file_descriptor.write(rdf_type) + i += 1 + for predicate_object_map in triples_map.predicate_object_maps_list: + if constant_predicate: + if predicate_object_map.predicate_map.mapping_type == "constant" or predicate_object_map.predicate_map.mapping_type == "constant shortcut": + predicate = "<" + predicate_object_map.predicate_map.value + ">" + elif predicate_object_map.predicate_map.mapping_type == "template": + if predicate_object_map.predicate_map.condition != "": + # field, condition = condition_separetor(predicate_object_map.predicate_map.condition) + # if row[field] == condition: + try: + predicate = "<" + string_substitution_xml(predicate_object_map.predicate_map.value, + "{(.+?)}", child, "predicate", + triples_map.iterator, parent_map, + namespace) + ">" + except: + predicate = None + # else: + # predicate = None else: - predicate = None - - if predicate_object_map.object_map.mapping_type == "constant" or predicate_object_map.object_map.mapping_type == "constant shortcut": - if "/" in predicate_object_map.object_map.value: - object = "<" + predicate_object_map.object_map.value + ">" + try: + predicate = "<" + 
string_substitution_xml(predicate_object_map.predicate_map.value, + "{(.+?)}", child, "predicate", + triples_map.iterator, parent_map, + namespace) + ">" + except: + predicate = None + elif predicate_object_map.predicate_map.mapping_type == "reference": + if predicate_object_map.predicate_map.condition != "": + # field, condition = condition_separetor(predicate_object_map.predicate_map.condition) + # if row[field] == condition: + predicate = string_substitution_xml(predicate_object_map.predicate_map.value, ".+", child, + "predicate", triples_map.iterator, parent_map, + namespace) + # else: + # predicate = None else: - object = "\"" + predicate_object_map.object_map.value + "\"" - if predicate_object_map.object_map.datatype != None: - object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) - elif predicate_object_map.object_map.mapping_type == "template": - object = string_substitution_xml(predicate_object_map.object_map.value, "{(.+?)}", child, "object", - triples_map.iterator, parent_map, namespace) - if isinstance(object, list): - for i in range(len(object)): - if predicate_object_map.object_map.term is None: - object[i] = "<" + object[i] + ">" - elif "IRI" in predicate_object_map.object_map.term: - object[i] = "<" + object[i] + ">" - else: - object[i] = "\"" + object[i] + "\"" - if predicate_object_map.object_map.datatype != None: - object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format( - predicate_object_map.object_map.datatype) - elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: - object[i] += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: - object[i] += "@en" - elif len(predicate_object_map.object_map.language) == 2: - object[i] += "@" + predicate_object_map.object_map.language - elif predicate_object_map.object_map.language_map != None: - lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", - child, "object", triples_map.iterator, parent_map, - namespace) - if lang != None: - object[i] += "@" + string_substitution_xml( - predicate_object_map.object_map.language_map, ".+", child, "object", - triples_map.iterator, parent_map, namespace)[1:-1] + predicate = string_substitution_xml(predicate_object_map.predicate_map.value, ".+", child, + "predicate", triples_map.iterator, parent_map, + namespace) + predicate = "<" + predicate[1:-1] + ">" + else: + predicate = None + + if predicate_object_map.object_map.mapping_type == "constant" or predicate_object_map.object_map.mapping_type == "constant shortcut": + if "/" in predicate_object_map.object_map.value: + object = "<" + predicate_object_map.object_map.value + ">" + else: + object = "\"" + predicate_object_map.object_map.value + "\"" + if predicate_object_map.object_map.datatype != None: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.mapping_type == "template": + 
object = string_substitution_xml(predicate_object_map.object_map.value, "{(.+?)}", child, "object", + triples_map.iterator, parent_map, namespace) + if isinstance(object, list): + for i in range(len(object)): if predicate_object_map.object_map.term is None: - object = "<" + object + ">" + object[i] = "<" + object[i] + ">" elif "IRI" in predicate_object_map.object_map.term: - object = "<" + object + ">" + object[i] = "<" + object[i] + ">" else: - object = "\"" + object + "\"" + object[i] = "\"" + object[i] + "\"" if predicate_object_map.object_map.datatype != None: - object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format( predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: - object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: - object += "@en" + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object[i] += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object[i] += "@en" elif len(predicate_object_map.object_map.language) == 2: - object += "@" + predicate_object_map.object_map.language + object[i] += "@" + predicate_object_map.object_map.language + else: + object[i] = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", child, "object", triples_map.iterator, parent_map, namespace) if lang != None: - object += "@" + string_substitution_xml( + object[i] += "@" + string_substitution_xml( predicate_object_map.object_map.language_map, ".+", child, "object", triples_map.iterator, parent_map, namespace)[1:-1] - elif predicate_object_map.object_map.mapping_type == "reference": - object = string_substitution_xml(predicate_object_map.object_map.value, ".+", child, "object", - triples_map.iterator, parent_map, namespace) - if object != None: - if isinstance(object, list): - for i in range(len(object)): - if "\\" in object[i][1:-1]: - object = "\"" + object[i][1:-1].replace("\\", "\\\\") + "\"" - if "'" in object[i][1:-1]: - object = "\"" + object[i][1:-1].replace("'", "\\\\'") + "\"" - if "\"" in object[i][1:-1]: - object = "\"" + object[i][1:-1].replace("\"", "\\\"") + "\"" - if "\n" in object[i]: - object[i] = object[i].replace("\n", "\\n") - if predicate_object_map.object_map.datatype != None: - object[i] += "^^<{}>".format(predicate_object_map.object_map.datatype) - elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: - object[i] += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: - object[i] += "@en" - elif len(predicate_object_map.object_map.language) == 
2: - object[i] += "@" + predicate_object_map.object_map.language - elif predicate_object_map.object_map.language_map != None: - lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", - child, "object", triples_map.iterator, parent_map, - namespace) - if lang != None: - object[i] += "@" + string_substitution_xml( - predicate_object_map.object_map.language_map, ".+", child, "object", - triples_map.iterator, parent_map, namespace)[1:-1] - elif predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - if " " not in object: - object[i] = "\"" + object[i][1:-1].replace("\\\\'", "'") + "\"" - object[i] = "<" + encode_char(object[i][1:-1]) + ">" - else: - object[i] = None - else: - if "\\" in object[1:-1]: - object = "\"" + object[1:-1].replace("\\", "\\\\") + "\"" - if "'" in object[1:-1]: - object = "\"" + object[1:-1].replace("'", "\\\\'") + "\"" - if "\"" in object[1:-1]: - object = "\"" + object[1:-1].replace("\"", "\\\"") + "\"" - if "\n" in object: - object = object.replace("\n", "\\n") + else: + if predicate_object_map.object_map.term is None: + object = "<" + object + ">" + elif "IRI" in predicate_object_map.object_map.term: + object = "<" + object + ">" + else: + object = "\"" + object + "\"" + if predicate_object_map.object_map.datatype != None: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( + predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.language != None: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object += "@en" + elif len(predicate_object_map.object_map.language) == 2: + object += "@" + predicate_object_map.object_map.language + else: + object = None + elif predicate_object_map.object_map.language_map != None: + lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", + child, "object", triples_map.iterator, parent_map, + namespace) + if lang != None: + object += "@" + string_substitution_xml( + predicate_object_map.object_map.language_map, ".+", child, "object", + triples_map.iterator, parent_map, namespace)[1:-1] + elif predicate_object_map.object_map.mapping_type == "reference": + object = string_substitution_xml(predicate_object_map.object_map.value, ".+", child, "object", + triples_map.iterator, parent_map, namespace) + if object != None: + if isinstance(object, list): + for i in range(len(object)): + if "\\" in object[i][1:-1]: + object[i] = "\"" + object[i][1:-1].replace("\\", "\\\\") + "\"" + if "'" in object[i][1:-1]: + object[i] = "\"" + object[i][1:-1].replace("'", "\\\\'") + "\"" + if "\"" in object[i][1:-1]: + object[i] = "\"" + object[i][1:-1].replace("\"", "\\\"") + "\"" + if "\n" in object[i]: + object[i] = object[i].replace("\n", "\\n") if predicate_object_map.object_map.datatype != None: - object += "^^<{}>".format(predicate_object_map.object_map.datatype) + object[i] += 
"^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: - object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: - object += "@en" + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object[i] += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object[i] += "@en" elif len(predicate_object_map.object_map.language) == 2: - object += "@" + predicate_object_map.object_map.language + object[i] += "@" + predicate_object_map.object_map.language + else: + object[i] = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", child, "object", triples_map.iterator, parent_map, namespace) if lang != None: - object += "@" + string_substitution_xml( + object[i] += "@" + string_substitution_xml( predicate_object_map.object_map.language_map, ".+", child, "object", triples_map.iterator, parent_map, namespace)[1:-1] elif predicate_object_map.object_map.term != None: if "IRI" in predicate_object_map.object_map.term: if " " not in object: - object = "\"" + object[1:-1].replace("\\\\'", "'") + "\"" - object = "<" + encode_char(object[1:-1]) + ">" + object[i] = "\"" + object[i][1:-1].replace("\\\\'", "'") + "\"" + object[i] = "<" + encode_char(object[i][1:-1]) + ">" else: - object = None - elif predicate_object_map.object_map.mapping_type == "parent triples map": - if subject != None: - for triples_map_element in triples_map_list: - if triples_map_element.triples_map_id == predicate_object_map.object_map.value: - if triples_map_element.data_source != triples_map.data_source: - if triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[ - 0] not in join_table: - if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": - if "http" in triples_map_element.data_source: - if triples_map_element.file_format == "JSONPath": - response = urlopen(triples_map_element.data_source) - data = json.loads(response.read()) + object[i] = None + else: + if "\\" in object[1:-1]: + object = "\"" + object[1:-1].replace("\\", "\\\\") + "\"" + if "'" in object[1:-1]: + object = "\"" + object[1:-1].replace("'", "\\\\'") + "\"" + if "\"" in object[1:-1]: + object = "\"" + object[1:-1].replace("\"", "\\\"") + "\"" + if "\n" in object: + object = object.replace("\n", "\\n") + if predicate_object_map.object_map.datatype != None: + object += "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object = "\"" + 
object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.language != None: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object += "@en" + elif len(predicate_object_map.object_map.language) == 2: + object += "@" + predicate_object_map.object_map.language + else: + object = None + elif predicate_object_map.object_map.language_map != None: + lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", + child, "object", triples_map.iterator, parent_map, + namespace) + if lang != None: + object += "@" + string_substitution_xml( + predicate_object_map.object_map.language_map, ".+", child, "object", + triples_map.iterator, parent_map, namespace)[1:-1] + elif predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + if " " not in object: + object = "\"" + object[1:-1].replace("\\\\'", "'") + "\"" + object = "<" + encode_char(object[1:-1]) + ">" + else: + object = None + elif predicate_object_map.object_map.mapping_type == "parent triples map": + if subject != None: + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == predicate_object_map.object_map.value: + if triples_map_element.data_source != triples_map.data_source: + if triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[ + 0] not in join_table: + if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": + if "http" in triples_map_element.data_source: + if triples_map_element.file_format == "JSONPath": + response = urlopen(triples_map_element.data_source) + data = json.loads(response.read()) + if isinstance(data, list): + hash_maker(data, triples_map_element, + predicate_object_map.object_map,"", triples_map_list) + elif len(data) < 2: + hash_maker(data[list(data.keys())[0]], triples_map_element, + predicate_object_map.object_map,"", triples_map_list) + else: + with open(str(triples_map_element.data_source), + "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + data = csv.DictReader(input_file_descriptor, delimiter=",") + hash_maker(data, triples_map_element, + predicate_object_map.object_map,"", triples_map_list) + else: + data = json.load(input_file_descriptor) if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) - else: - with open(str(triples_map_element.data_source), - "r") as input_file_descriptor: - if str(triples_map_element.file_format).lower() == "csv": - data = csv.DictReader(input_file_descriptor, delimiter=",") - hash_maker(data, triples_map_element, - predicate_object_map.object_map) - else: - data = json.load(input_file_descriptor) - if isinstance(data, list): - hash_maker(data, triples_map_element, - predicate_object_map.object_map) - elif len(data) < 2: - hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) - elif triples_map_element.file_format == "XPath": - 
with open(str(triples_map_element.data_source), - "r") as input_file_descriptor: - child_tree = ET.parse(input_file_descriptor) - child_root = child_tree.getroot() - hash_maker_xml(child_root, triples_map_element, - predicate_object_map.object_map, parent_map, namespace) + elif triples_map_element.file_format == "XPath": + with open(str(triples_map_element.data_source), + "r") as input_file_descriptor: + child_tree = ET.parse(input_file_descriptor) + child_root = child_tree.getroot() + hash_maker_xml(child_root, triples_map_element, + predicate_object_map.object_map, parent_map, namespace) + else: + database, query_list = translate_sql(triples_map_element) + db = connector.connect(host=host, port=int(port), user=user, + password=password) + cursor = db.cursor(buffered=True) + cursor.execute("use " + datab) + for query in query_list: + cursor.execute(query) + hash_maker_array(cursor, triples_map_element, + predicate_object_map.object_map) + + if "@" in predicate_object_map.object_map.child[0]: + child_condition = predicate_object_map.object_map.child[0].split("@")[len(predicate_object_map.object_map.child[0].split("@"))-1] + if child_condition in child.attrib: + if child.attrib[child_condition] != None: + if child.attrib[child_condition] in join_table[ + triples_map_element.triples_map_id + "_" + + predicate_object_map.object_map.child[0]]: + object_list = join_table[triples_map_element.triples_map_id + "_" + + predicate_object_map.object_map.child[0]][ + child.attrib[child_condition]] + else: + object_list = [] + else: + object_list = [] + else: + if child.find(predicate_object_map.object_map.child[0]) != None: + if child.find(predicate_object_map.object_map.child[0]).text in join_table[ + triples_map_element.triples_map_id + "_" + + predicate_object_map.object_map.child[0]]: + object_list = join_table[triples_map_element.triples_map_id + "_" + + predicate_object_map.object_map.child[0]][ + child.find(predicate_object_map.object_map.child[0]).text] else: - database, query_list = translate_sql(triples_map_element) - db = connector.connect(host=host, port=int(port), user=user, - password=password) - cursor = db.cursor(buffered=True) - cursor.execute("use " + datab) - for query in query_list: - cursor.execute(query) - hash_maker_array(cursor, triples_map_element, - predicate_object_map.object_map) + object_list = [] + object = None + else: + if predicate_object_map.object_map.parent != None: + if triples_map_element.triples_map_id + "_" + \ + predicate_object_map.object_map.child[0] not in join_table: + with open(str(triples_map_element.data_source), + "r") as input_file_descriptor: + child_tree = ET.parse(input_file_descriptor) + child_root = child_tree.getroot() + hash_maker_xml(child_root, triples_map_element, + predicate_object_map.object_map, parent_map, namespace) if "@" in predicate_object_map.object_map.child[0]: child_condition = predicate_object_map.object_map.child[0].split("@")[len(predicate_object_map.object_map.child[0].split("@"))-1] @@ -2157,115 +2892,31 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): object_list = [] object = None else: - if predicate_object_map.object_map.parent != None: - if triples_map_element.triples_map_id + "_" + \ - predicate_object_map.object_map.child[0] not in join_table: - with open(str(triples_map_element.data_source), - "r") as input_file_descriptor: - child_tree = ET.parse(input_file_descriptor) - child_root = child_tree.getroot() - hash_maker_xml(child_root, triples_map_element, - predicate_object_map.object_map, 
parent_map, namespace) - - if "@" in predicate_object_map.object_map.child[0]: - child_condition = predicate_object_map.object_map.child[0].split("@")[len(predicate_object_map.object_map.child[0].split("@"))-1] - if child_condition in child.attrib: - if child.attrib[child_condition] != None: - if child.attrib[child_condition] in join_table[ - triples_map_element.triples_map_id + "_" + - predicate_object_map.object_map.child[0]]: - object_list = join_table[triples_map_element.triples_map_id + "_" + - predicate_object_map.object_map.child[0]][ - child.attrib[child_condition]] - else: - object_list = [] - else: - object_list = [] - else: - if child.find(predicate_object_map.object_map.child[0]) != None: - if child.find(predicate_object_map.object_map.child[0]).text in join_table[ - triples_map_element.triples_map_id + "_" + - predicate_object_map.object_map.child[0]]: - object_list = join_table[triples_map_element.triples_map_id + "_" + - predicate_object_map.object_map.child[0]][ - child.find(predicate_object_map.object_map.child[0]).text] - else: - object_list = [] + try: + object = "<" + string_substitution_xml( + triples_map_element.subject_map.value, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, namespace) + ">" + except TypeError: object = None - else: - try: - object = "<" + string_substitution_xml( - triples_map_element.subject_map.value, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, namespace) + ">" - except TypeError: - object = None - break - else: - continue - else: - object = None + break + else: + continue else: object = None + else: + object = None - if predicate in general_predicates: - dictionary_table_update(predicate + "_" + predicate_object_map.object_map.value) - else: - dictionary_table_update(predicate) - if predicate != None and (object != None or object) and subject != None: - for graph in triples_map.subject_map.graph: - dictionary_table_update(subject) - if isinstance(object, list): - for obj in object: - dictionary_table_update(obj) - triple = subject + " " + predicate + " " + obj + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, - "subject", - triples_map.iterator, - parent_map, - namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, - namespace) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[dic_table[ - predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: - 
output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - else: - dictionary_table_update(object) - triple = subject + " " + predicate + " " + object + ".\n" + if predicate in general_predicates: + dictionary_table_update(predicate + "_" + predicate_object_map.object_map.value) + else: + dictionary_table_update(predicate) + if predicate != None and (object != None or object) and subject != None: + for graph in triples_map.subject_map.graph: + dictionary_table_update(subject) + if isinstance(object, list): + for obj in object: + dictionary_table_update(obj) + triple = subject + " " + predicate + " " + obj + ".\n" if graph != None and "defaultGraph" not in graph: if "{" in graph: triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, @@ -2287,89 +2938,97 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): output_file_descriptor.write(triple) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) + dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]]: output_file_descriptor.write(triple) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: if dic_table[predicate] not in g_triples: output_file_descriptor.write(triple) g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate]]: output_file_descriptor.write(triple) g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: output_file_descriptor.write(triple) i += 1 - if predicate[1:-1] in predicate_object_map.graph: - if isinstance(object, list): - for obj in object: - triple = subject + " " + predicate + " " + obj + ".\n" - if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ - predicate_object_map.graph[predicate[1:-1]]: - if "{" in predicate_object_map.graph[predicate[1:-1]]: - triple = triple[:-2] + " <" + string_substitution_xml( - predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(predicate_object_map.graph[predicate[1:-1]], - "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, - namespace) + ">") - else: - triple = triple[:-2] + " <" + predicate_object_map.graph[ - predicate[1:-1]] + ">.\n" - dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - 
g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - predicate + "_" + predicate_object_map.object_map.value]: - output_file_descriptor.write(triple) - g_triples[dic_table[ - predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[predicate]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) + else: + dictionary_table_update(object) + triple = subject + " " + predicate + " " + object + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, + "subject", + triples_map.iterator, + parent_map, + namespace) + ">.\n" + dictionary_table_update( + "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, + namespace) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 else: - triple = subject + " " + predicate + " " + object + ".\n" + output_file_descriptor.write(triple) + i += 1 + if predicate[1:-1] in predicate_object_map.graph: + if isinstance(object, list): + for obj in object: + triple = subject + " " + predicate + " " + obj + ".\n" if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ predicate_object_map.graph[predicate[1:-1]]: if "{" in predicate_object_map.graph[predicate[1:-1]]: triple = triple[:-2] + " <" + string_substitution_xml( predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", triples_map.iterator, parent_map, namespace) + ">.\n" + dictionary_table_update( + "<" + string_substitution_xml(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, + namespace) + ">") else: - triple = triple[:-2] + " <" + 
predicate_object_map.graph[predicate[1:-1]] + ">.\n" + triple = triple[:-2] + " <" + predicate_object_map.graph[ + predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") if duplicate == "yes": if predicate in general_predicates: if dic_table[ @@ -2377,57 +3036,143 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): output_file_descriptor.write(triple) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) + dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ predicate + "_" + predicate_object_map.object_map.value]: output_file_descriptor.write(triple) g_triples[dic_table[ predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: if dic_table[predicate] not in g_triples: output_file_descriptor.write(triple) g_triples.update({dic_table[predicate]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) + dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate]]: output_file_descriptor.write(triple) g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: output_file_descriptor.write(triple) - elif predicate != None and subject != None and object_list: - dictionary_table_update(subject) - for obj in object_list: - dictionary_table_update(obj) - for graph in triples_map.subject_map.graph: - if predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" - else: - triple = subject + " " + predicate + " " + obj + ".\n" + else: + triple = subject + " " + predicate + " " + object + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution_xml( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, namespace) + ">.\n" + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + predicate + "_" + predicate_object_map.object_map.value]: + output_file_descriptor.write(triple) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[predicate]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif 
dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + output_file_descriptor.write(triple) + elif predicate != None and subject != None and object_list: + dictionary_table_update(subject) + for obj in object_list: + dictionary_table_update(obj) + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, + "subject", + triples_map.iterator, + parent_map, + namespace) + ">.\n" + dictionary_table_update( + "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, + namespace) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + else: + output_file_descriptor.write(triple) + i += 1 + if predicate[1:-1] in predicate_object_map.graph: + triple = subject + " " + predicate + " " + obj + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution_xml( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, namespace) + ">.\n" + dictionary_table_update( + "<" + string_substitution_xml(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", child, "subject", triples_map.iterator, + parent_map, namespace) + ">") else: - triple = subject + " " + predicate + " " + obj + ".\n" - - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, - "subject", - triples_map.iterator, - parent_map, - namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, - namespace) + ">") - else: - triple = 
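# A minimal sketch of the duplicate-elimination scheme the hunks above extend,
# assuming illustrative names (intern, write_once): g_triples maps an interned
# predicate key to the subject_object pairs already written, and dic_table
# interns every term string to a numeric id so those keys stay small. This is a
# reduction of the diff's logic, not the module's actual code.
import sys

dic_table = {}
g_triples = {}

def intern(term):
    # Counterpart of dictionary_table_update: one stable id per distinct term.
    if term not in dic_table:
        dic_table[term] = len(dic_table)
    return dic_table[term]

def write_once(subject, predicate, obj, out=sys.stdout):
    # Emit the triple only if this (subject, object) pair is new for the predicate.
    pair = str(intern(subject)) + "_" + str(intern(obj))
    seen = g_triples.setdefault(intern(predicate), {})
    if pair in seen:
        return 0
    seen[pair] = ""
    out.write(subject + " " + predicate + " " + obj + ".\n")
    return 1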
triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") if duplicate == "yes": if predicate in general_predicates: if dic_table[ @@ -2440,8 +3185,8 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]]: output_file_descriptor.write(triple) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: @@ -2459,61 +3204,15 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): else: output_file_descriptor.write(triple) i += 1 - if predicate[1:-1] in predicate_object_map.graph: - triple = subject + " " + predicate + " " + obj + ".\n" - if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ - predicate_object_map.graph[predicate[1:-1]]: - if "{" in predicate_object_map.graph[predicate[1:-1]]: - triple = triple[:-2] + " <" + string_substitution_xml( - predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(predicate_object_map.graph[predicate[1:-1]], - "{(.+?)}", child, "subject", triples_map.iterator, - parent_map, namespace) + ">") - else: - triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" - dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[dic_table[ - predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - object_list = [] - else: - continue + object_list = [] + else: + continue return i def semantify_json(triples_map, triples_map_list, delimiter, output_file_descriptor, data, iterator): logger.info("TM: " + triples_map.triples_map_name) - + global current_logical_dump triples_map_triples = {} generated_triples = {} object_list = [] @@ -2521,6 +3220,8 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip global blank_message global host, port, user, password, datab i = 0 + if iterator == "$[*]": + iterator = "$.[*]" if 
iterator != "None" and iterator != "$.[*]" and iterator != "": new_iterator = "" temp_keys = iterator.split(".") @@ -2587,8 +3288,10 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip else: i += semantify_json(triples_map, triples_map_list, delimiter, output_file_descriptor, row, iterator.replace(new_iterator[:-1], "")) + elif iterator == "$.[*]": + for row in data: + i += semantify_json(triples_map, triples_map_list, delimiter, output_file_descriptor, row, "") else: - create_subject = True global generated_subjects @@ -2644,9 +3347,18 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -2655,9 +3367,18 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -2712,9 +3433,18 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip try: if " " not in subject_value: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: logger.error(" is an invalid URL") subject = None @@ -2730,9 +3460,18 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -2844,6 +3583,13 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip object = "\"" + predicate_object_map.object_map.value + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + 
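# A minimal sketch of the subject-resolution rule the hunks above add in four
# nearly identical places; the same hunk also normalizes the root JSONPath
# iterator "$[*]" to "$.[*]" before dispatching per-row recursion.
# is_valid_url_syntax and encode_char live in the package's functions module;
# the simplified stand-ins below are assumptions made for a self-contained example.
from urllib.parse import quote, urlparse

def encode_char(value):           # stand-in for rdfizer's percent-encoding helper
    return quote(value, safe="/:#")

def is_valid_url_syntax(value):   # stand-in for the helper referenced in the diff
    parts = urlparse(value)
    return bool(parts.scheme) and bool(parts.netloc)

def resolve_subject(subject_value, base=""):
    # Relative values are absolutized against the declared base; without a base
    # the patch falls back to http://example.com/base/. Values that merely
    # contain "http" but are not well-formed IRIs get the same treatment.
    if "http" not in subject_value:
        if base != "":
            return "<" + base + subject_value + ">"
        return "<http://example.com/base/" + encode_char(subject_value) + ">"
    if is_valid_url_syntax(subject_value):
        return "<" + subject_value + ">"
    if base != "":
        return "<" + base + subject_value + ">"
    return "<http://example.com/base/" + encode_char(subject_value) + ">"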
"^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data, + "object", ignore, iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.mapping_type == "template": try: object = string_substitution_json(predicate_object_map.object_map.value, "{(.+?)}", data, "object", @@ -2865,6 +3611,33 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip if "." in object_list[i]: object_list[i] = object_list[i].replace(".", "2E") object_list[i] = encode_char(object_list[i]) + else: + if predicate_object_map.object_map.datatype != None: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format( + predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data, + "object", ignore, iterator) + if "http" in datatype_value: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.language != None: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object[i] += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object[i] += "@en" + elif len(predicate_object_map.object_map.language) == 2: + object[i] += "@" + predicate_object_map.object_map.language + else: + object[i] = None + elif predicate_object_map.object_map.language_map != None: + lang = string_substitution_json(predicate_object_map.object_map.language_map, ".+", + data, "object", ignore, iterator) + if lang != None: + object[i] += "@" + string_substitution_json( + predicate_object_map.object_map.language_map, ".+", data, "object", ignore, + iterator)[1:-1] i += 1 else: if predicate_object_map.object_map.term is None: @@ -2885,13 +3658,22 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data, + "object", ignore, iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif 
len(predicate_object_map.object_map.language) == 2:
                                 object += "@" + predicate_object_map.object_map.language
+                            else:
+                                object = None
                         elif predicate_object_map.object_map.language_map != None:
                             lang = string_substitution_json(predicate_object_map.object_map.language_map, ".+",
                                                             data, "object", ignore, iterator)
@@ -2921,13 +3703,22 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip
                             if predicate_object_map.object_map.datatype != None:
                                 object_list[i] = "\"" + object_list[i][1:-1] + "\"" + "^^<{}>".format(
                                     predicate_object_map.object_map.datatype)
+                            elif predicate_object_map.object_map.datatype_map != None:
+                                datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data,
+                                                                          "object", ignore, iterator)
+                                if "http" in datatype_value:
+                                    object_list[i] = "\"" + object_list[i][1:-1] + "\"" + "^^<{}>".format(datatype_value)
+                                else:
+                                    object_list[i] = "\"" + object_list[i][1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value)
                             elif predicate_object_map.object_map.language != None:
-                                if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language:
+                                if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language:
                                     object_list[i] += "@es"
-                                elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language:
+                                elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language:
                                     object_list[i] += "@en"
                                 elif len(predicate_object_map.object_map.language) == 2:
                                     object_list[i] += "@" + predicate_object_map.object_map.language
+                                else:
+                                    object_list[i] = None
                             elif predicate_object_map.object_map.language_map != None:
                                 object_list[i] += "@" + string_substitution_json(
                                     predicate_object_map.object_map.language_map, ".+", data, "object", ignore,
@@ -2959,13 +3750,22 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip
                         if predicate_object_map.object_map.datatype != None:
                             object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(
                                 predicate_object_map.object_map.datatype)
+                        elif predicate_object_map.object_map.datatype_map != None:
+                            datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data,
+                                                                      "object", ignore, iterator)
+                            if "http" in datatype_value:
+                                object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value)
+                            else:
+                                object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value)
                         elif predicate_object_map.object_map.language != None:
-                            if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language:
+                            if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language:
                                 object += "@es"
-                            elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language:
+                            elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language:
                                 object += "@en"
                             elif len(predicate_object_map.object_map.language) == 2:
                                 object += "@" + predicate_object_map.object_map.language
+                            else:
+                                object = None
                         elif predicate_object_map.object_map.language_map != None:
                             lang = string_substitution_json(predicate_object_map.object_map.language_map, ".+",
                                                             data, "object", ignore, iterator)
@@ -2993,10 +3793,10 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip
                                     data = 
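# A minimal sketch of the literal decoration added above, under assumed names
# (decorate_literal is illustrative): rml:datatypeMap values are resolved per
# row and attached as ^^<IRI> (prefixed with the example.com base when the
# resolved value is not an IRI), and language handling now demands an exact
# match ("es"/"spanish", "en"/"english", or any two-letter tag) instead of the
# old substring test; anything else now invalidates the object.
def decorate_literal(lexical, datatype=None, language=None):
    literal = "\"" + lexical + "\""
    if datatype is not None:
        if "http" in datatype:
            return literal + "^^<{}>".format(datatype)
        return literal + "^^<{}>".format("http://example.com/base/" + datatype)
    if language is not None:
        if language in ("spanish", "es"):
            return literal + "@es"
        if language in ("english", "en"):
            return literal + "@en"
        if len(language) == 2:
            return literal + "@" + language
        return None            # invalid tag: the patch now discards the object
    return literal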
json.loads(response.read()) if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: with open(str(triples_map_element.data_source), "r") as input_file_descriptor: @@ -3004,16 +3804,17 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip data_element = csv.DictReader(input_file_descriptor, delimiter=delimiter) hash_maker(data_element, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: data_element = json.load(input_file_descriptor) if triples_map_element.iterator != "None" and triples_map_element.iterator != "$.[*]" and triples_map_element.iterator != "[*]": join_iterator(data_element, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, + triples_map_list) else: hash_maker(data_element[list(data_element.keys())[0]], - triples_map_element, predicate_object_map.object_map) + triples_map_element, predicate_object_map.object_map,"", triples_map_list) elif triples_map_element.file_format == "XPath": with open(str(triples_map_element.data_source), "r") as input_file_descriptor: @@ -3050,15 +3851,15 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip with open(str(triples_map_element.data_source), "r") as input_file_descriptor: if str(triples_map_element.file_format).lower() == "csv": data = csv.DictReader(input_file_descriptor, delimiter=delimiter) - hash_maker(data, triples_map_element, predicate_object_map.object_map) + hash_maker(data, triples_map_element, predicate_object_map.object_map,"", triples_map_list) else: parent_data = json.load(input_file_descriptor) if triples_map_element.iterator != "None": join_iterator(parent_data, triples_map_element.iterator, - triples_map_element, predicate_object_map.object_map) + triples_map_element, predicate_object_map.object_map, triples_map_list) else: hash_maker(parent_data[list(parent_data.keys())[0]], - triples_map_element, predicate_object_map.object_map) + triples_map_element, predicate_object_map.object_map,"", triples_map_list) if "." 
in predicate_object_map.object_map.child[0]: temp_keys = predicate_object_map.object_map.child[0].split(".") temp_data = data @@ -3206,161 +4007,65 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip object = "<" + string_substitution_json(triples_map_element.subject_map.value, "{(.+?)}", data, "object", ignore, parent_iterator) + ">" - except TypeError: - object = None - break - else: - continue - else: - object = None - else: - object = None - - if predicate in general_predicates: - dictionary_table_update(predicate + "_" + predicate_object_map.object_map.value) - else: - dictionary_table_update(predicate) - if predicate != None and object != None and subject != None: - dictionary_table_update(subject) - dictionary_table_update(object) - for graph in triples_map.subject_map.graph: - triple = subject + " " + predicate + " " + object + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, "subject", - ignore, iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, - iterator) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - if predicate[1:-1] in predicate_object_map.graph: - triple = subject + " " + predicate + " " + object + ".\n" - if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ - predicate_object_map.graph[predicate[1:-1]]: - if "{" in predicate_object_map.graph[predicate[1:-1]]: - triple = triple[:-2] + " <" + string_substitution_json( - predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", data, "subject", ignore, - iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution_json(predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", - data, "subject", ignore, iterator) + ">") - else: - triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" - dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - 
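# A minimal sketch of the join_table idea behind the hash_maker(...) calls
# above, which now also receive a mode string and the full triples_map_list so
# referencing maps can be resolved recursively. The parent source is scanned
# once and indexed by join-key value, so each child row becomes a dictionary
# lookup instead of a rescan. All names here are illustrative stand-ins.
import csv

join_table = {}

def index_parent(csv_path, parent_key, table_id):
    # One pass over the parent source; bucket parent rows by join-key value.
    with open(csv_path, newline="") as fd:
        bucket = join_table.setdefault(table_id, {})
        for row in csv.DictReader(fd):
            bucket.setdefault(row[parent_key], []).append(row)

def lookup(table_id, child_value):
    return join_table.get(table_id, {}).get(child_value, [])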
dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - predicate + "_" + predicate_object_map.object_map.value]: - output_file_descriptor.write(triple) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - elif predicate != None and subject != None and object_list: - dictionary_table_update(subject) - for obj in object_list: - dictionary_table_update(obj) - for graph in triples_map.subject_map.graph: - if predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" - else: - triple = subject + " " + predicate + " " + obj + ".\n" + except TypeError: + object = None + break else: - triple = subject + " " + predicate + " " + obj + ".\n" + continue + else: + object = None + else: + object = None + if is_current_output_valid(triples_map.triples_map_id,predicate_object_map,current_logical_dump,logical_dump): + if predicate in general_predicates: + dictionary_table_update(predicate + "_" + predicate_object_map.object_map.value) + else: + dictionary_table_update(predicate) + if predicate != None and object != None and subject != None: + dictionary_table_update(subject) + dictionary_table_update(object) + for graph in triples_map.subject_map.graph: + triple = subject + " " + predicate + " " + object + ".\n" if graph != None and "defaultGraph" not in graph: if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, - "subject", ignore, - iterator) + ">.\n" + triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, "subject", + ignore, iterator) + ">.\n" dictionary_table_update( "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, iterator) + ">") else: triple = triple[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 + if predicate_object_map.graph[predicate[1:-1]] == None or graph != None: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { + 
dic_table[subject] + "_" + dic_table[object]: ""}})
+                                            i += 1
+                                        elif dic_table[subject] + "_" + dic_table[object] not in g_triples[
+                                            dic_table[predicate + "_" + predicate_object_map.object_map.value]]:
+                                            output_file_descriptor.write(triple)
+                                            g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update(
+                                                {dic_table[subject] + "_" + dic_table[object]: ""})
+                                            i += 1
+                                    else:
+                                        if dic_table[predicate] not in g_triples:
+                                            output_file_descriptor.write(triple)
+                                            g_triples.update(
+                                                {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}})
+                                            i += 1
+                                        elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]:
+                                            output_file_descriptor.write(triple)
+                                            g_triples[dic_table[predicate]].update(
+                                                {dic_table[subject] + "_" + dic_table[object]: ""})
+                                            i += 1
+                            else:
+                                output_file_descriptor.write(triple)
+                                i += 1
                         if predicate[1:-1] in predicate_object_map.graph:
                             triple = subject + " " + predicate + " " + object + ".\n"
                             if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \
                                     predicate_object_map.graph[predicate[1:-1]]:
                                 if "{" in predicate_object_map.graph[predicate[1:-1]]:
@@ -3368,180 +4073,279 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip
                                     predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", data, "subject", ignore,
                                     iterator) + ">.\n"
                                 dictionary_table_update(
-                                    "<" + string_substitution_json(predicate_object_map.graph[predicate[1:-1]],
-                                                                   "{(.+?)}", data, "subject", ignore, iterator) + ">")
+                                    "<" + string_substitution_json(predicate_object_map.graph[predicate[1:-1]], "{(.+?)}",
+                                                                   data, "subject", ignore, iterator) + ">")
                             else:
                                 triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n"
                                 dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">")
                             if duplicate == "yes":
                                 if predicate in general_predicates:
-                                    if dic_table[
-                                        predicate + "_" + predicate_object_map.object_map.value] not in g_triples:
+                                    if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples:
                                         output_file_descriptor.write(triple)
                                         g_triples.update({dic_table[
                                             predicate + "_" + predicate_object_map.object_map.value]: {
-                                            dic_table[subject] + "_" + dic_table[obj]: ""}})
+                                            dic_table[subject] + "_" + dic_table[object]: ""}})
                                         i += 1
-                                    elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[
-                                        dic_table[predicate + "_" + predicate_object_map.object_map.value]]:
+                                    elif dic_table[subject] + "_" + dic_table[object] not in g_triples[
+                                        dic_table[predicate + "_" + predicate_object_map.object_map.value]]:
                                         output_file_descriptor.write(triple)
                                         g_triples[
                                             dic_table[predicate + "_" + predicate_object_map.object_map.value]].update(
-                                            {dic_table[subject] + "_" + dic_table[obj]: ""})
+                                            {dic_table[subject] + "_" + dic_table[object]: ""})
                                         i += 1
                                     else:
                                         if dic_table[predicate] not in g_triples:
                                             output_file_descriptor.write(triple)
                                             g_triples.update(
-                                                
{dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: output_file_descriptor.write(triple) g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) + {dic_table[subject] + "_" + dic_table[object]: ""}) i += 1 else: output_file_descriptor.write(triple) i += 1 - object_list = [] - elif predicate != None and subject_list: - for subj in subject_list: - dictionary_table_update(subj) - type_predicate = "" - for rdf_class in triples_map.subject_map.rdf_class: - if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): - for graph in triples_map.subject_map.graph: - obj = "<{}>".format(rdf_class) - dictionary_table_update(obj) - dictionary_table_update(type_predicate + "_" + obj) - rdf_type = subj + " " + type_predicate + " " + obj + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - rdf_type = rdf_type[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, - "subject", ignore, - iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, - iterator) + ">") - else: - rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if dic_table[type_predicate + "_" + obj] not in g_triples: - output_file_descriptor.write(rdf_type) - g_triples.update( - {dic_table[type_predicate + "_" + obj]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ - dic_table[type_predicate + "_" + obj]]: - output_file_descriptor.write(rdf_type) - g_triples[dic_table[type_predicate + "_" + obj]].update( - {dic_table[subj] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(rdf_type) - i += 1 - if object != None: - dictionary_table_update(object) - triple = subj + " " + predicate + " " + object + ".\n" + elif predicate != None and subject != None and object_list: + dictionary_table_update(subject) + for obj in object_list: + dictionary_table_update(obj) for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" if graph != None and "defaultGraph" not in graph: if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, "subject", - ignore, iterator) + ">.\n" + triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, + "subject", ignore, + iterator) + ">.\n" dictionary_table_update( "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, iterator) + ">") else: triple = triple[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subj] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subj] + "_" + dic_table[object] not in 
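# A minimal sketch of the new gate wrapped around triple generation above. The
# patch introduces a logical_dump registry (logical target support) plus a
# current_logical_dump marker, and a predicate-object map is materialized only
# while the output it is routed to is being produced. The body below is an
# assumption about the behaviour of is_current_output_valid, not its source.
logical_dump = {}          # assumed shape: {triples_map_id: {pom_key: output_name}}
current_logical_dump = ""  # name of the dump being generated right now

def is_current_output_valid(tm_id, pom_key, current_dump, dumps):
    routed = dumps.get(tm_id, {}).get(pom_key)
    # Unrouted maps go to every output; routed ones only to their own dump.
    return routed is None or routed == current_dump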
g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subj] + "_" + dic_table[object]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subj] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subj] + "_" + dic_table[object]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - elif object_list: - for obj in object_list: - dictionary_table_update(obj) - for graph in triples_map.subject_map.graph: - if predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - triple = subj + " " + predicate + " <" + obj[1:-1] + ">.\n" + if predicate_object_map.graph[predicate[1:-1]] == None or graph != None: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 else: - triple = subj + " " + predicate + " " + obj + ".\n" + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 else: - triple = subj + " " + predicate + " " + obj + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, - "subject", ignore, - iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, - iterator) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") + output_file_descriptor.write(triple) + i += 1 + + if predicate[1:-1] in predicate_object_map.graph: + triple = subject + " " + predicate + " " + obj + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution_json( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", data, "subject", ignore, + iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution_json(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", data, "subject", ignore, iterator) + ">") + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + 
predicate_object_map.graph[predicate[1:-1]] + ">") if duplicate == "yes": if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: output_file_descriptor.write(triple) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subj] + "_" + dic_table[obj]: ""}}) + dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]]: output_file_descriptor.write(triple) g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + else: + output_file_descriptor.write(triple) + i += 1 + object_list = [] + elif predicate != None and subject_list: + for subj in subject_list: + dictionary_table_update(subj) + type_predicate = "" + for rdf_class in triples_map.subject_map.rdf_class: + if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): + for graph in triples_map.subject_map.graph: + obj = "<{}>".format(rdf_class) + dictionary_table_update(obj) + dictionary_table_update(type_predicate + "_" + obj) + rdf_type = subj + " " + type_predicate + " " + obj + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + rdf_type = rdf_type[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, + "subject", ignore, + iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, + iterator) + ">") + else: + rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if duplicate == "yes": + if dic_table[type_predicate + "_" + obj] not in g_triples: + output_file_descriptor.write(rdf_type) + g_triples.update( + {dic_table[type_predicate + "_" + obj]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[type_predicate + "_" + obj]]: + output_file_descriptor.write(rdf_type) + g_triples[dic_table[type_predicate + "_" + obj]].update( {dic_table[subj] + "_" + dic_table[obj]: ""}) i += 1 else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(rdf_type) + i += 1 + if object != None: + dictionary_table_update(object) + triple = subj + " " + predicate + " " + object + ".\n" + for graph in triples_map.subject_map.graph: + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, "subject", + ignore, iterator) + ">.\n" + dictionary_table_update( + 
"<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, + iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + else: + output_file_descriptor.write(triple) + i += 1 + elif object_list: + for obj in object_list: + dictionary_table_update(obj) + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subj + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subj + " " + predicate + " " + obj + ".\n" + else: + triple = subj + " " + predicate + " " + obj + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, + "subject", ignore, + iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, + iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + else: output_file_descriptor.write(triple) - 
g_triples[dic_table[predicate]].update( - {dic_table[subj] + "_" + dic_table[obj]: ""}) i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - else: - continue - else: - continue + else: + continue + else: + continue return i -def semantify_file(triples_map, triples_map_list, delimiter, output_file_descriptor, data): +def semantify_file(triples_map, triples_map_list, delimiter, output_file_descriptor, data, no_inner_cycle): """ (Private function, not accessible from outside this package) @@ -3569,6 +4373,7 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip """ object_list = [] + subject_list = [] triples_string = "" end_turtle = "" i = 0 @@ -3645,9 +4450,18 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -3656,9 +4470,18 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip # if row[field] == condition: try: if "http" not in subject_value: - subject = subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -3711,27 +4534,98 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip subject_value = subject_value[1:-1] if triples_map.subject_map.condition == "": if " " not in subject_value: - if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if "BlankNode" in triples_map.subject_map.term_type: + subject = "_:" + subject_value else: - subject = "<" + subject_value + ">" + if "http" not in subject_value: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + else: + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: subject = None else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - if "http" not in subject_value: - subject = "<" + base + subject_value + ">" - else: - subject = "<" + subject_value + ">" - except: - subject = None + subject = None elif "constant" in triples_map.subject_map.subject_mapping_type: subject = "<" + triples_map.subject_map.value + ">" + elif "function" in triples_map.subject_map.subject_mapping_type: + subject = None + if new_formulation == "no": + temp_dics = [] + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == 
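# A minimal sketch of the rdf:type fan-out the reindented block above performs
# for each subject in subject_list: one type triple per rr:class, repeated per
# subject-map graph and promoted to an N-Quads-style quad when the graph is not
# the default one. An illustrative reduction, assuming the names below.
RDF_TYPE = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"

def type_quads(subjects, classes, graphs=(None,)):
    lines = []
    for subj in subjects:
        for rdf_class in classes:
            for graph in graphs:
                line = subj + " " + RDF_TYPE + " <" + rdf_class + ">"
                if graph is not None and "defaultGraph" not in graph:
                    line += " <" + graph + ">"   # named graph as fourth term
                lines.append(line + ".\n")
    return lines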
triples_map.subject_map.value: + dic = create_dictionary(triples_map_element) + current_func = {"output_name":"OUTPUT", + "inputs":dic["inputs"], + "function":dic["executes"], + "func_par":dic} + for inputs in dic["inputs"]: + temp_dic = {} + if "reference function" in inputs: + temp_dic = {"inputs":dic["inputs"], + "function":dic["executes"], + "func_par":dic, + "id":triples_map_element.triples_map_id} + if inner_function_exists(temp_dic, temp_dics): + temp_dics.append(temp_dic) + if temp_dics: + func = inner_function(row,current_func,triples_map_list) + subject = "<" + encode_char(func) + ">" + else: + func = execute_function(row,current_func) + subject = "<" + encode_char(func) + ">" + else: + func = None + for func_map in triples_map.func_map_list: + if func_map.func_map_id == triples_map.subject_map.value: + current_func = {"inputs":func_map.parameters, + "function":func_map.name} + inner_func = False + for param in func_map.parameters: + if "function" in func_map.parameters[param]["type"]: + inner_func = True + if inner_func: + func = new_inner_function(row,triples_map.subject_map.value,triples_map) + else: + func = execute_function(row,None,current_func) + if triples_map.subject_map.func_result != None and func != None: + func = func[triples_map.subject_map.func_result] + if func != None: + if "http://" in func or "https://" in func: + subject = "<" + func + ">" + else: + subject = "<" + encode_char(func) + ">" + else: + subject = None + elif "quoted triples map" in triples_map.subject_map.subject_mapping_type: + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == triples_map.subject_map.value: + if triples_map_element.data_source != triples_map.data_source: + if triples_map.subject_map.parent != None: + if ("quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child) not in join_table: + if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": + with open(str(triples_map_element.data_source), "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + data = csv.DictReader(input_file_descriptor, delimiter=',') + hash_maker(data, triples_map_element, triples_map.subject_map, "quoted", triples_map_list) + else: + pass + if row[triples_map.subject_map.child] in join_table["quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child]: + subject_list = join_table["quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child][row[triples_map.subject_map.child]] + else: + subject_list = inner_semantify_file(triples_map_element, triples_map_list, delimiter, row, base) + subject = None else: if triples_map.subject_map.condition == "": @@ -3760,7 +4654,7 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if triples_map.subject_map.rdf_class != [None] and subject != None: predicate = "" for rdf_class in triples_map.subject_map.rdf_class: - if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): + if rdf_class != None and rdf_class != "None" and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): obj = "<{}>".format(rdf_class) rdf_type = subject + " " + predicate + " " + obj + ".\n" for graph in triples_map.subject_map.graph: @@ -3775,35 +4669,36 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" 
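# A minimal sketch of how the new FNML path above evaluates a function-valued
# subject map: a function map carries a function name plus per-parameter
# sources, and parameters may themselves be function maps, which
# new_inner_function resolves recursively. The registry-based evaluator below
# is an assumed stand-in for execute_function/new_inner_function.
def evaluate(row, func_maps, func_id, registry):
    fmap = func_maps[func_id]          # {"function": name, "inputs": {param: spec}}
    args = {}
    for name, spec in fmap["inputs"].items():
        if spec["type"] == "function":      # nested FNML function
            args[name] = evaluate(row, func_maps, spec["value"], registry)
        elif spec["type"] == "reference":   # column / JSONPath reference
            args[name] = row[spec["value"]]
        else:                               # constant parameter
            args[name] = spec["value"]
    return registry[fmap["function"]](**args)

# e.g. registry = {"grel:toUpperCase": lambda valueParameter: valueParameter.upper()}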
dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - dictionary_table_update(subject) - dictionary_table_update(obj) - dictionary_table_update(predicate + "_" + obj) - if dic_table[predicate + "_" + obj] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(rdf_type) + if no_inner_cycle: + if duplicate == "yes": + dictionary_table_update(subject) + dictionary_table_update(obj) + dictionary_table_update(predicate + "_" + obj) + if dic_table[predicate + "_" + obj] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(rdf_type) + else: + output_file_descriptor.write(subject + " a " + determine_prefix(obj)) + g_triples.update( + {dic_table[predicate + "_" + obj]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + obj]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(rdf_type) + else: + output_file_descriptor.write(subject + " a " + determine_prefix(obj)) + g_triples[dic_table[predicate + "_" + obj]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 else: - output_file_descriptor.write(subject + " a " + determine_prefix(obj)) - g_triples.update( - {dic_table[predicate + "_" + obj]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + obj]]: + duplicate_type = True + else: if output_format.lower() == "n-triples": output_file_descriptor.write(rdf_type) else: output_file_descriptor.write(subject + " a " + determine_prefix(obj)) - g_triples[dic_table[predicate + "_" + obj]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 - else: - duplicate_type = True - else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(rdf_type) - else: - output_file_descriptor.write(subject + " a " + determine_prefix(obj)) - i += 1 if output_format.lower() == "turtle" and len(triples_map.predicate_object_maps_list) == 0: output_file_descriptor.write(".\n") @@ -3844,6 +4739,29 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip predicate = string_substitution(predicate_object_map.predicate_map.value, ".+", row, "predicate", ignore, triples_map.iterator) predicate = "<" + predicate[1:-1] + ">" + elif predicate_object_map.predicate_map.mapping_type == "function": + if new_formulation == "yes": + func = None + for func_map in triples_map.func_map_list: + if func_map.func_map_id == predicate_object_map.predicate_map.value: + current_func = {"inputs":func_map.parameters, + "function":func_map.name} + inner_func = False + for param in func_map.parameters: + if "function" in func_map.parameters[param]["type"]: + inner_func = True + if inner_func: + func = new_inner_function(row,predicate_object_map.predicate_map.value,triples_map) + else: + func = execute_function(row,None,current_func) + if predicate_object_map.predicate_map.func_result != None and func != None: + func = func[predicate_object_map.predicate_map.func_result] + if None != func: + predicate = "<" + func + ">" + else: + predicate = None + else: + predicate = None else: predicate = None @@ -3858,6 +4776,14 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: object = "\"" + object[1:-1] + "\"" + "^^{}".format( determine_prefix(predicate_object_map.object_map.datatype)) + elif predicate_object_map.object_map.datatype_map 
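# A minimal sketch of the effect of the new no_inner_cycle flag threaded through
# semantify_file above, under the assumption that an inner cycle is a run whose
# only purpose is collecting subjects for an RML-Star/quoted join: such runs
# write nothing, while top-level runs write rdf:type triples, abbreviating
# rdf:type as "a" on the Turtle path. emit_type is an illustrative name.
def emit_type(rdf_type_line, out, no_inner_cycle, output_format="n-triples"):
    if not no_inner_cycle:                 # inner cycle: collect, do not serialize
        return 0
    if output_format.lower() == "n-triples":
        out.write(rdf_type_line)
    else:                                  # Turtle path: "<s> a <C>."
        subject, _, obj = rdf_type_line.rstrip(".\n").split(" ", 2)
        out.write(subject + " a " + obj + ".\n")
    return 1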
!= None: + datatype_value = string_substitution(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + "object", ignore, triples_map.iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.mapping_type == "template": try: if predicate_object_map.object_map.term is None: @@ -3883,13 +4809,22 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + "object", ignore, triples_map.iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution(predicate_object_map.object_map.language_map, ".+", row, "object", ignore, triples_map.iterator) @@ -3917,13 +4852,22 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: object = "\"" + object[1:-1] + "\"" + "^^{}".format( determine_prefix(predicate_object_map.object_map.datatype)) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + "object", ignore, triples_map.iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution(predicate_object_map.object_map.language_map, ".+", row, "object", ignore, triples_map.iterator) @@ -3937,6 +4881,11 @@ def 
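# A minimal sketch of the function-valued predicate maps handled above under the
# new formulation: the function result, optionally narrowed to one named output
# via func_result as in the diff, must yield an IRI, and a None result
# suppresses the predicate and with it the whole pair. run_function is an
# assumed stand-in for execute_function/new_inner_function.
def predicate_from_function(row, run_function, func_result=None):
    value = run_function(row)
    if value is not None and func_result is not None:
        value = value[func_result]     # dict-valued results expose named outputs
    return None if value is None else "<" + str(value) + ">"

# e.g. predicate_from_function({"col": "name"},
#                              lambda r: "http://example.com/base/" + r["col"])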
semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip object = "<" + encode_char(object[1:-1]) + ">" else: object = None + elif "BlankNode" in predicate_object_map.object_map.term: + if " " not in object: + object = "_:" + object[1:-1] + else: + object = None elif predicate_object_map.object_map.mapping_type == "parent triples map": if subject != None: for triples_map_element in triples_map_list: @@ -3954,22 +4903,22 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if triples_map_element.iterator != "None" and triples_map_element.iterator != "$.[*]" and triples_map_element.iterator != "[*]" and triples_map_element.iterator != "[*]": join_iterator(data, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: with open(str(triples_map_element.data_source), "r") as input_file_descriptor: @@ -3980,30 +4929,30 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip reader = reader.drop_duplicates(keep='first') data = reader.to_dict(orient='records') hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: data = json.load(input_file_descriptor) if triples_map_element.iterator: if triples_map_element.iterator != "None" and triples_map_element.iterator != "$.[*]" and triples_map_element.iterator != "[*]" and triples_map_element.iterator != "[*]": join_iterator(data, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif triples_map_element.file_format == "XPath": with open(str(triples_map_element.data_source), @@ -4046,23 +4995,24 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if triples_map_element.iterator != "None" and triples_map_element.iterator != "$.[*]" and triples_map_element.iterator != "[*]": join_iterator(data, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, + triples_map_list) else: if isinstance(data, list): hash_maker(data, 
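# Editor's sketch of the hash join behind the "parent triples map" branches:
# hash_maker indexes the parent source once by the join key and caches it in
# join_table, so each child row becomes a dictionary probe rather than a
# rescan of the parent source. A simplified stand-in for that indexing step:
def hash_join_index(parent_rows, parent_key, make_subject):
    index = {}
    for parent_row in parent_rows:
        value = parent_row.get(parent_key)
        if value is not None:
            # every subject the parent map generates for this key value
            index.setdefault(value, []).append(make_subject(parent_row))
    return index
# probe side, per child row: object_list = index.get(row[child_key], [])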
triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: with open(str(triples_map_element.data_source), "r") as input_file_descriptor: @@ -4084,23 +5034,24 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip join_iterator(data, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, + triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) if child_list_value(predicate_object_map.object_map.child, row) in \ join_table[triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]]: @@ -4177,44 +5128,35 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip object = None else: if predicate_object_map.object_map.parent != None: - if predicate_object_map.object_map.parent[0] != \ - predicate_object_map.object_map.child[0]: - if (triples_map_element.triples_map_id + "_" + child_list( - predicate_object_map.object_map.child)) not in join_table: - with open(str(triples_map_element.data_source), - "r") as input_file_descriptor: - if str(triples_map_element.file_format).lower() == "csv": - parent_data = csv.DictReader(input_file_descriptor, - delimiter=delimiter) + if (triples_map_element.triples_map_id + "_" + child_list( + predicate_object_map.object_map.child)) not in join_table: + with open(str(triples_map_element.data_source), + "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + parent_data = csv.DictReader(input_file_descriptor, + delimiter=delimiter) + hash_maker_list(parent_data, triples_map_element, + predicate_object_map.object_map) + else: + parent_data = json.load(input_file_descriptor) + if isinstance(parent_data, list): hash_maker_list(parent_data, triples_map_element, predicate_object_map.object_map) else: - parent_data = json.load(input_file_descriptor) - if isinstance(parent_data, list): - hash_maker_list(parent_data, triples_map_element, - predicate_object_map.object_map) - else: - hash_maker_list(parent_data[list(parent_data.keys())[0]], - triples_map_element, - predicate_object_map.object_map) - if sublist(predicate_object_map.object_map.child, row.keys()): - if child_list_value(predicate_object_map.object_map.child, row) in \ - join_table[triples_map_element.triples_map_id + "_" + child_list( - 
predicate_object_map.object_map.child)]: - object_list = join_table[ - triples_map_element.triples_map_id + "_" + child_list( - predicate_object_map.object_map.child)][ - child_list_value(predicate_object_map.object_map.child, row)] - else: - object_list = [] - object = None - else: - try: - object = "<" + string_substitution(triples_map_element.subject_map.value, - "{(.+?)}", row, "object", ignore, - triples_map.iterator) + ">" - except TypeError: - object = None + hash_maker_list(parent_data[list(parent_data.keys())[0]], + triples_map_element, + predicate_object_map.object_map) + if sublist(predicate_object_map.object_map.child, row.keys()): + if child_list_value(predicate_object_map.object_map.child, row) in \ + join_table[triples_map_element.triples_map_id + "_" + child_list( + predicate_object_map.object_map.child)]: + object_list = join_table[ + triples_map_element.triples_map_id + "_" + child_list( + predicate_object_map.object_map.child)][ + child_list_value(predicate_object_map.object_map.child, row)] + else: + object_list = [] + object = None else: try: object = "<" + string_substitution(triples_map_element.subject_map.value, @@ -4227,6 +5169,94 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip continue else: object = None + elif predicate_object_map.object_map.mapping_type == "reference function": + object = None + if new_formulation == "no": + temp_dics = [] + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == predicate_object_map.object_map.value: + dic = create_dictionary(triples_map_element) + current_func = {"inputs":dic["inputs"], + "function":dic["executes"], + "func_par":dic} + for inputs in dic["inputs"]: + temp_dic = {} + if "reference function" in inputs: + temp_dic = {"inputs":dic["inputs"], + "function":dic["executes"], + "func_par":dic, + "id":triples_map_element.triples_map_id} + if inner_function_exists(temp_dic, temp_dics): + temp_dics.append(temp_dic) + if temp_dics: + func = inner_function(row,current_func,triples_map_list) + if predicate_object_map.object_map.term is not None: + if "IRI" in predicate_object_map.object_map.term: + object = "<" + encode_char(func) + ">" + else: + if "" != func: + object = "\"" + func + "\"" + else: + object = None + else: + if predicate_object_map.object_map.term is not None: + func = execute_function(row,None,current_func) + if "IRI" in predicate_object_map.object_map.term: + object = "<" + encode_char(func) + ">" + else: + func = execute_function(row,None,current_func) + if "" != func: + object = "\"" + func + "\"" + else: + object = None + else: + func = None + for func_map in triples_map.func_map_list: + if func_map.func_map_id == predicate_object_map.object_map.value: + current_func = {"inputs":func_map.parameters, + "function":func_map.name} + inner_func = False + for param in func_map.parameters: + if "function" in func_map.parameters[param]["type"]: + inner_func = True + if inner_func: + func = new_inner_function(row,predicate_object_map.object_map.value,triples_map) + else: + func = execute_function(row,None,current_func) + if predicate_object_map.object_map.func_result != None and func != None: + func = func[predicate_object_map.object_map.func_result] + if predicate_object_map.object_map.term is not None: + if func != None: + if "IRI" in predicate_object_map.object_map.term: + if "http://" in func.lower() or "https://" in func.lower(): + object = "<" + func + ">" + else: + object = "<" + encode_char(func) + ">" + else: + object = None + else: + if 
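# Editor's note on the multi-column join refactor above: child_list flattens
# the child column names into a single join_table key, and child_list_value
# concatenates the row's values the same way, so composite rr:joinConditions
# still resolve with one lookup. A guess at their shape (the "_" separator
# is an assumption, not confirmed by the patch):
def child_list(columns):
    return "_".join(columns)

def child_list_value(columns, row):
    return "_".join(str(row[column]) for column in columns)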
None != func: + object = "\"" + func + "\"" + else: + object = None + elif "quoted triples map" in predicate_object_map.object_map.mapping_type: + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == predicate_object_map.object_map.value: + if triples_map_element.data_source != triples_map.data_source: + if predicate_object_map.object_map.parent != None: + if ("quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]) not in join_table: + if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": + with open(str(triples_map_element.data_source), "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + data = csv.DictReader(input_file_descriptor, delimiter=',') + hash_maker(data, triples_map_element, predicate_object_map.object_map, "quoted", triples_map_list) + else: + pass + if row[predicate_object_map.object_map.child[0]] in join_table["quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]]: + object_list = join_table["quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]][row[predicate_object_map.object_map.child[0]]] + else: + object_list = inner_semantify_file(triples_map_element, triples_map_list, delimiter, row, base) + object = None else: object = None @@ -4348,66 +5378,67 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: triple = triple[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - - if duplicate == "yes": - dictionary_table_update(subject) - dictionary_table_update(object) - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - generated += 1 - else: - if dic_table[predicate] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) + if no_inner_cycle: + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + dictionary_table_update(subject) + dictionary_table_update(object) + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + 
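# Editor's sketch of the FNML dispatch in the "reference function" branch:
# under the new formulation, a function map's parameters are scanned for
# nested functions, the map is evaluated, and func_result optionally picks
# one field of the returned value. execute_function and new_inner_function
# are assumed to come from the new fnml_functions/inner_functions modules;
# evaluate_function_map itself is illustrative:
from rdfizer.fnml_functions import execute_function
from rdfizer.inner_functions import new_inner_function

def evaluate_function_map(row, func_map, triples_map, func_result=None):
    current_func = {"inputs": func_map.parameters, "function": func_map.name}
    has_inner = any("function" in func_map.parameters[param]["type"]
                    for param in func_map.parameters)
    if has_inner:
        result = new_inner_function(row, func_map.func_map_id, triples_map)
    else:
        result = execute_function(row, None, current_func)
    if func_result is not None and result is not None:
        result = result[func_result]  # unpack a single field of the result
    return result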
generated) + g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, predicate_object_map, triples_map, output_file_descriptor, generated) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) i += 1 generated += 1 - else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - i += 1 - generated += 1 if predicate[1:-1] in predicate_object_map.graph: triple = subject + " " + predicate + " " + object + ".\n" if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ @@ -4422,116 +5453,32 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples.update({dic_table[ - 
predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - predicate + "_" + predicate_object_map.object_map.value]: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - generated += 1 - else: - if dic_table[predicate] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate]]: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - generated += 1 - else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - i += 1 - generated += 1 - elif predicate != None and subject != None and object_list: - for obj in object_list: - if obj != None: - for graph in triples_map.subject_map.graph: - if predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" - else: - triple = subject + " " + predicate + " " + obj + ".\n" - else: - triple = subject + " " + predicate + " " + obj + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject", - ignore, - triples_map.iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution(graph, "{(.+?)}", row, "subject", ignore, - triples_map.iterator) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") + if no_inner_cycle: if duplicate == "yes": - dictionary_table_update(subject) - dictionary_table_update(obj) if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, predicate_object_map, triples_map, output_file_descriptor, generated) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) + dic_table[subject] + "_" + 
dic_table[object]: ""}}) i += 1 generated += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + predicate + "_" + predicate_object_map.object_map.value]: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, predicate_object_map, triples_map, output_file_descriptor, generated) g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) + {dic_table[subject] + "_" + dic_table[object]: ""}) i += 1 generated += 1 else: @@ -4539,44 +5486,309 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, predicate_object_map, triples_map, output_file_descriptor, generated) g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) i += 1 generated += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ dic_table[predicate]]: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) + end_turtle = turtle_print(subject, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + i += 1 + generated += 1 + elif predicate != None and subject != None and object_list: + for obj in object_list: + if obj != None: + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + else: + if "quoted triples map" in predicate_object_map.object_map.mapping_type: + triple = subject + " " + predicate + " <<" + obj + ">>.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject", + ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(graph, "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if no_inner_cycle: + if 
predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + dictionary_table_update(subject) + dictionary_table_update(obj) + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + i += 1 + generated += 1 + if predicate[1:-1] in predicate_object_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") + if no_inner_cycle: + if duplicate == 
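# Editor's note on the RDF-star cases above: subjects or objects produced by
# a quoted triples map are embedded with the RDF-star << ... >> syntax, while
# term-typed IRIs keep the plain < ... > wrapping. A small illustration
# (format_term is not a function in the module):
def format_term(value, quoted=False, iri=False):
    if quoted:
        return "<<" + value + ">>"  # value already holds "s p o" of the inner triple
    if iri:
        return "<" + value + ">"
    return value  # literal or blank node, already serialized
# e.g. format_term("<http://ex.org/s> <http://ex.org/p> <http://ex.org/o>",
#                  quoted=True) + " <http://ex.org/saidBy> <http://ex.org/src>.\n"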
"yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, + output_file_descriptor, generated) + i += 1 + generated += 1 + object_list = [] + elif predicate != None and subject_list and object != None: + dictionary_table_update(object) + for subj in subject_list: + if subj != None: + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = "<<" + subj + ">> " + predicate + " <" + object[1:-1] + ">.\n" + else: + triple = "<<" + subj + ">> " + predicate + " " + object + ".\n" + else: + triple = "<<" + subj + ">> " + predicate + " " + object + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject", + ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(graph, "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if no_inner_cycle: + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + dictionary_table_update(subj) + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in 
g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) i += 1 generated += 1 - - else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, obj, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - i += 1 - generated += 1 - if predicate[1:-1] in predicate_object_map.graph: if predicate_object_map.object_map.term != None: if "IRI" in predicate_object_map.object_map.term: - triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + triple = subj + " " + predicate + " <" + object[1:-1] + ">.\n" else: - triple = subject + " " + predicate + " " + obj + ".\n" + triple = subj + " " + predicate + " " + object + ".\n" else: - triple = subject + " " + predicate + " " + obj + ".\n" + triple = subj + " " + predicate + " " + object + ".\n" if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ predicate_object_map.graph[predicate[1:-1]]: if "{" in predicate_object_map.graph[predicate[1:-1]]: @@ -4590,72 +5802,254 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - if 
output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, - triples_map, output_file_descriptor, - generated) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, - triples_map, output_file_descriptor, - generated) - g_triples[dic_table[ - predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - generated += 1 + if no_inner_cycle: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) else: - if dic_table[predicate] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) + end_turtle = turtle_print(subj, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, + output_file_descriptor, generated) + i += 1 + generated += 1 + subject_list = [] + elif predicate != None and subject_list and object_list: + for subj in subject_list: + for obj in object_list: + if obj != None and 
subj != None: + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = "<<" + subj + ">> " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = "<<" + subj + ">> " + predicate + " " + obj + ".\n" + else: + if "quoted triples map" in predicate_object_map.object_map.mapping_type: + triple = "<<" + subj + ">> " + predicate + " <<" + obj + ">>.\n" + else: + triple = "<<" + subj + ">> " + predicate + " " + obj + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject", + ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(graph, "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if no_inner_cycle: + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + dictionary_table_update(subj) + dictionary_table_update(obj) + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, - triples_map, output_file_descriptor, - generated) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + + else: if output_format.lower() == "n-triples": 
output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, - triples_map, output_file_descriptor, + end_turtle = turtle_print(subj, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, generated) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 generated += 1 + if predicate[1:-1] in predicate_object_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subj + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subj + " " + predicate + " " + obj + ".\n" else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) + triple = subj + " " + predicate + " " + obj + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") else: - end_turtle = turtle_print(subject, predicate, obj, object_list, duplicate_type, - predicate_object_map, triples_map, - output_file_descriptor, generated) - i += 1 - generated += 1 + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") + if no_inner_cycle: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + 
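# Editor's sketch of the graph handling repeated through these branches: a
# graph term may itself be a template, in which case "{...}" is filled from
# the current row (via string_substitution) before the quad is written.
# append_graph is an illustrative condensation:
def append_graph(triple, graph, fill_template):
    if graph is None or "defaultGraph" in graph:
        return triple  # stays in the default graph
    graph_iri = fill_template(graph) if "{" in graph else graph
    return triple[:-2] + " <" + graph_iri + ">.\n"  # swap ".\n" for " <g>.\n"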
duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, + output_file_descriptor, generated) + i += 1 + generated += 1 object_list = [] + subject_list = [] else: continue return i @@ -4742,9 +6136,18 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -4753,9 +6156,18 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -4811,9 +6223,18 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file subject_value = subject_value[1:-1] if " " not in subject_value: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: logger.error(" is an invalid URL") subject = None @@ -4830,9 +6251,18 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -4930,6 +6360,13 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file object = "\"" + predicate_object_map.object_map.value + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + 
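# Editor's sketch of the subject-IRI fallback added across semantify_mysql
# and semantify_postgres: values without "http" are resolved against base,
# or against the example base when base is empty; values that look absolute
# are kept only if they validate. resolve_subject is an illustrative name;
# encode_char and is_valid_url_syntax are the module's helpers:
def resolve_subject(subject_value, base):
    if "http" not in subject_value:
        if base != "":
            return "<" + base + subject_value + ">"
        return "<http://example.com/base/" + encode_char(subject_value) + ">"
    if is_valid_url_syntax(subject_value):
        return "<" + subject_value + ">"
    if base != "":
        return "<" + base + subject_value + ">"
    return "<http://example.com/base/" + encode_char(subject_value) + ">"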
"^^<{}>".format(predicate_object_map.object_map.datatype) + if predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_array(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.mapping_type == "template": try: if predicate_object_map.object_map.term is None: @@ -4954,13 +6391,22 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file row_headers, "object", ignore) + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_array(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_array(predicate_object_map.object_map.language_map, ".+", row, row_headers, "object", ignore) @@ -4983,13 +6429,22 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file object = object.replace("\n", "\\n") if predicate_object_map.object_map.datatype != None: object += "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_array(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map 
!= None: lang = string_substitution_array(predicate_object_map.object_map.language_map, ".+", row, row_headers, "object", ignore) @@ -5028,7 +6483,7 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file with open(str(triples_map_element.data_source), "r") as input_file_descriptor: if str(triples_map_element.file_format).lower() == "csv": data = csv.DictReader(input_file_descriptor, delimiter=",") - hash_maker(data, triples_map_element, predicate_object_map.object_map) + hash_maker(data, triples_map_element, predicate_object_map.object_map,"", triples_map_list) else: data = json.load(input_file_descriptor) if isinstance(data, list): @@ -5234,45 +6689,46 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file else: triple = triple[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) - g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) - g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) + g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) + g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) + g_triples.update({dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) + g_triples[dic_table[predicate]].update({dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 else: - if dic_table[predicate] not in g_triples: - try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) - g_triples.update({dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: 
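# Editor's note on the new gating condition wrapped around the write paths
# (here and in the file-based code): a triple is emitted at this point unless
# its predicate has a dedicated graph map and no other graph applies; the
# predicate-graph block that follows handles that remaining case. As a
# boolean sketch:
def should_write_here(predicate_iri, po_graphs, graph, subject_graphs):
    return (predicate_iri not in po_graphs   # no per-predicate graph mapping
            or graph is not None             # an explicit graph already applies
            or subject_graphs == [None])     # subject map defines no graph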
- try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) - g_triples[dic_table[predicate]].update({dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) i += 1 if predicate[1:-1] in predicate_object_map.graph: triple = subject + " " + predicate + " " + object + ".\n" @@ -5533,9 +6989,18 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -5544,9 +7009,18 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -5605,9 +7079,18 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f try: if " " not in subject_value: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: logger.error(" is an invalid URL") subject = None @@ -5628,9 +7111,18 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -5742,6 +7234,13 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f object = "\"" + predicate_object_map.object_map.value + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + 
"^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_postgres(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.mapping_type == "template": try: if predicate_object_map.object_map.term is None: @@ -5766,13 +7265,22 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f row_headers, "object", ignore) + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_postgres(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_postgres(predicate_object_map.object_map.language_map, ".+", row, row_headers, "object", ignore) @@ -5795,13 +7303,22 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f object = object.replace("\n", "\\n") if predicate_object_map.object_map.datatype != None: object += "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_postgres(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif 
predicate_object_map.object_map.language_map != None: lang = string_substitution_postgres(predicate_object_map.object_map.language_map, ".+", row, row_headers, "object", ignore) @@ -5836,11 +7353,11 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f with open(str(triples_map_element.data_source), "r") as input_file_descriptor: if str(triples_map_element.file_format).lower() == "csv": data = csv.DictReader(input_file_descriptor, delimiter=",") - hash_maker(data, triples_map_element, predicate_object_map.object_map) + hash_maker(data, triples_map_element, predicate_object_map.object_map,"", triples_map_list) else: data = json.load(input_file_descriptor) hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif triples_map_element.file_format == "XPath": with open(str(triples_map_element.data_source), "r") as input_file_descriptor: @@ -6209,6 +7726,8 @@ def semantify(config_path, log_path='error.log'): global blank_message global generated_subjects global user, password, port, host, datab + global current_logical_dump + global g_triples start = time.time() if config["datasets"]["all_in_one_file"] == "no": @@ -6242,15 +7761,9 @@ def semantify(config_path, log_path='error.log'): for source_type in order_list: if source_type == "csv": for source in order_list[source_type]: - if enrichment == "yes": - if ".csv" in source: - reader = pd.read_csv(source, dtype=str, encoding="latin-1") - else: - reader = pd.read_csv(source, dtype=str, sep='\t', encoding="latin-1") - reader = reader.where(pd.notnull(reader), None) - if duplicate == "yes": - reader = reader.drop_duplicates(keep='first') - data = reader.to_dict(orient='records') + if ".nt" in source: + g = rdflib.Graph() + g.parse(source, format="nt") for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6259,13 +7772,91 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: + results = g.query(sorted_sources[source_type][source][triples_map].iterator) + data = [] + for row in results: + result_dict = {} + keys = list(row.__dict__["labels"].keys()) + i = 0 + while i < len(row): + result_dict[str(keys[i])] = str(row[keys[i]]) + i += 1 + data.append(result_dict) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: 
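+                                                            # Same pattern as the JSON-LD branch above: re-parse the freshly
+                                                            # written N-Triples dump and rewrite the file in the requested
+                                                            # serialization (Notation3 here).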
+ g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_file, sorted_sources[source_type][ source][triples_map], triples_map_list, ",", output_file_descriptor, - data).result() + data, True).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -6274,7 +7865,7 @@ def semantify(config_path, log_path='error.log'): generated_subjects = release_subjects( sorted_sources[source_type][source][triples_map], generated_subjects) - else: + elif "endpoint:" in source: for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6283,26 +7874,304 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: - with open(source, "r", encoding="latin-1") as input_file_descriptor: - if ".csv" in source: - data = csv.DictReader(input_file_descriptor, delimiter=',') - else: - data = csv.DictReader(input_file_descriptor, delimiter='\t') - blank_message = True - number_triple += executor.submit(semantify_file, - sorted_sources[source_type][ - source][triples_map], - triples_map_list, ",", - output_file_descriptor, - data).result() - if duplicate == "yes": - predicate_list = release_PTT( - sorted_sources[source_type][source][triples_map], - predicate_list) - if mapping_partitions == "yes": - generated_subjects = release_subjects( - sorted_sources[source_type][source][triples_map], - generated_subjects) + sparql = SPARQLWrapper(source.replace("endpoint:","")) + sparql.setQuery(sorted_sources[source_type][source][triples_map].iterator) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + data = [] + for result in 
results["results"]["bindings"]: + result_dict = {} + for key, value in result.items(): + result_dict[key] = value["value"] + data.append(result_dict) + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + if enrichment == "yes": + if ".csv" in source: + if source in delimiter: + reader = pd.read_csv(source, dtype=str, sep=delimiter[source], 
encoding="latin-1") + else: + reader = pd.read_csv(source, dtype=str, encoding="latin-1") + else: + reader = pd.read_csv(source, dtype=str, sep='\t', encoding="latin-1") + reader = reader.where(pd.notnull(reader), None) + if duplicate == "yes": + reader = reader.drop_duplicates(keep='first') + data = reader.to_dict(orient='records') + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + 
zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + with open(source, "r", encoding="latin-1") as input_file_descriptor: + if ".csv" in source: + if source in delimiter: + data = csv.DictReader(input_file_descriptor, delimiter=delimiter[source]) + else: + data = csv.DictReader(input_file_descriptor, delimiter=',') + else: + data = csv.DictReader(input_file_descriptor, delimiter='\t') + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + 
tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) elif source_type == "JSONPath": for source in order_list[source_type]: for triples_map in sorted_sources[source_type][source]: @@ -6315,12 +8184,105 @@ def semantify(config_path, log_path='error.log'): triples_map].subject_map.rdf_class != [None]: if "http" in sorted_sources[source_type][source][ triples_map].data_source: - response = urlopen( - sorted_sources[source_type][source][triples_map].data_source) - data = json.loads(response.read()) + file_source = sorted_sources[source_type][source][triples_map].data_source + if "#" in file_source: + file = file_source.split("#")[1] + else: + file = file_source.split("/")[len(file_source.split("/"))-1] + if "gz" in file_source or "zip" in file_source or "tar.xz" in file_source or "tar.gz" in file_source: + response = requests.get(file_source) + with open(file, "wb") as f: + f.write(response.content) + if "zip" in file_source: + with zipfile.ZipFile(file, 'r') as zip_ref: + zip_ref.extractall() + data = json.load(open(file.replace(".zip",""))) + elif "tar.xz" in file_source or "tar.gz" in file_source: + with tarfile.open(file, "r") as tar: + tar.extractall() + if "tar.xz" in file_source: + data = json.load(open(file.replace(".tar.xz",""))) + else: + data = json.load(open(file.replace(".tar.gz",""))) + elif "gz" in file_source: + with open(file, "rb") as gz_file: + with open(file.replace(".gz",""), "wb") as txt_file: + shutil.copyfileobj(gzip.GzipFile(fileobj=gz_file), txt_file) + data = json.load(open(file.replace(".gz",""))) + else: + response = urlopen(file_source) + data = json.loads(response.read()) else: data = json.load(open(source)) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_json, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + ",", logical_output_descriptor, data, + sorted_sources[source_type][source][ + triples_map].iterator).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in 
dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_json, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6346,6 +8308,72 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_xml, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + logical_output_descriptor).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = 
rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_xml, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6362,16 +8390,9 @@ def semantify(config_path, log_path='error.log'): for source_type in sorted_sources: if source_type == "csv": for source in sorted_sources[source_type]: - if enrichment == "yes": - if ".csv" in source: - reader = pd.read_csv(source, dtype=str, encoding="latin-1") # latin-1 - else: - reader = pd.read_csv(source, dtype=str, sep="\t", header=0, - encoding="latin-1") - reader = reader.where(pd.notnull(reader), None) - if duplicate == "yes": - reader = reader.drop_duplicates(keep='first') - data = reader.to_dict(orient='records') + if ".nt" in source: + g = rdflib.Graph() + g.parse(source, format="nt") for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6380,13 +8401,91 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: + results = g.query(sorted_sources[source_type][source][triples_map].iterator) + data = [] + for row in results: + result_dict = {} + keys = list(row.__dict__["labels"].keys()) + i = 0 + while i < len(row): + result_dict[str(keys[i])] = str(row[keys[i]]) + i += 1 + data.append(result_dict) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], 
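+                                                                                           # rows flow into this map's dedicated logical dump,
+                                                                                           # not the shared dataset output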
+ triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_file, sorted_sources[source_type][ source][triples_map], triples_map_list, ",", output_file_descriptor, - data).result() + data, True).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -6395,7 +8494,7 @@ def semantify(config_path, log_path='error.log'): generated_subjects = release_subjects( sorted_sources[source_type][source][triples_map], generated_subjects) - else: + elif "endpoint:" in source: for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6404,26 +8503,305 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: - blank_message = True - with open(source, "r", encoding="latin-1") as input_file_descriptor: - if ".csv" in source: - data = csv.DictReader(input_file_descriptor, delimiter=',') - else: - data = csv.DictReader(input_file_descriptor, delimiter='\t') - number_triple += executor.submit(semantify_file, - sorted_sources[source_type][ - source][triples_map], - triples_map_list, ",", - output_file_descriptor, - data).result() - if duplicate == "yes": - predicate_list = release_PTT( - 
sorted_sources[source_type][source][triples_map], - predicate_list) - if mapping_partitions == "yes": - generated_subjects = release_subjects( - sorted_sources[source_type][source][triples_map], - generated_subjects) + sparql = SPARQLWrapper(source.replace("endpoint:","")) + sparql.setQuery(sorted_sources[source_type][source][triples_map].iterator) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + data = [] + for result in results["results"]["bindings"]: + result_dict = {} + for key, value in result.items(): + result_dict[key] = value["value"] + data.append(result_dict) + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, 
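+                                                                          # after any per-map logical dumps, the same rows
+                                                                          # also feed the shared output file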
+ data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + if enrichment == "yes": + if ".csv" in source: + if source in delimiter: + reader = pd.read_csv(source, dtype=str, sep=delimiter[source], encoding="latin-1") + else: + reader = pd.read_csv(source, dtype=str, encoding="latin-1") # latin-1 + else: + reader = pd.read_csv(source, dtype=str, sep="\t", header=0, + encoding="latin-1") + reader = reader.where(pd.notnull(reader), None) + if duplicate == "yes": + reader = reader.drop_duplicates(keep='first') + data = reader.to_dict(orient='records') + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + for triples_map in sorted_sources[source_type][source]: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in 
dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + for triples_map in sorted_sources[source_type][source]: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + with open(source, "r", encoding="latin-1") as input_file_descriptor: + if ".csv" in source: + if source in delimiter: + data = csv.DictReader(input_file_descriptor, delimiter=delimiter[source]) + else: + data = csv.DictReader(input_file_descriptor, delimiter=',') + else: + data = csv.DictReader(input_file_descriptor, delimiter='\t') + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with 
open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) elif source_type == "JSONPath": for source in sorted_sources[source_type]: for triples_map in sorted_sources[source_type][source]: @@ -6436,13 +8814,106 @@ def semantify(config_path, log_path='error.log'): triples_map].subject_map.rdf_class != [None]: if "http" in sorted_sources[source_type][source][ triples_map].data_source: - response = urlopen( - sorted_sources[source_type][source][triples_map].data_source) - data = json.loads(response.read()) + file_source = sorted_sources[source_type][source][triples_map].data_source + if "#" in file_source: + file = file_source.split("#")[1] + else: + file = file_source.split("/")[len(file_source.split("/"))-1] + if "gz" in file_source or "zip" in file_source or "tar.xz" in file_source or "tar.gz" in file_source: + response = requests.get(file_source) + with open(file, "wb") as f: + f.write(response.content) + if "zip" in file_source: + with zipfile.ZipFile(file, 'r') as zip_ref: + zip_ref.extractall() + data = json.load(open(file.replace(".zip",""))) + elif "tar.xz" in file_source or "tar.gz" in file_source: + with tarfile.open(file, "r") as tar: + tar.extractall() + if "tar.xz" in file_source: + data = json.load(open(file.replace(".tar.xz",""))) + else: + data = json.load(open(file.replace(".tar.gz",""))) + elif "gz" in file_source: + with open(file, "rb") as gz_file: + with open(file.replace(".gz",""), "wb") as txt_file: + shutil.copyfileobj(gzip.GzipFile(fileobj=gz_file), txt_file) + data = json.load(open(file.replace(".gz",""))) + else: + response = urlopen(file_source) + data = json.loads(response.read()) else: data = json.load(open( sorted_sources[source_type][source][triples_map].data_source)) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = 
is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_json, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + ",", logical_output_descriptor, data, + sorted_sources[source_type][source][ + triples_map].iterator).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_json, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6468,6 +8939,72 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_xml, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + 
logical_output_descriptor).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_xml, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6524,8 +9061,87 @@ def semantify(config_path, log_path='error.log'): else: predicate = None if data == []: - for row in cursor: - if config[dataset_i]["db"].lower() != "none": + if config[dataset_i]["db"].lower() != "none": + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + current_logical_dump = "" + cursor.execute(source) + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = 
extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + for row in cursor: number_triple += executor.submit(semantify_mysql, row, row_headers, sorted_sources[ @@ -6541,7 +9157,87 @@ def semantify(config_path, log_path='error.log'): "password"], config[dataset_i]["db"], predicate).result() - else: + data.append(row) + else: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() + current_logical_dump = "" + cursor.execute(source) + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = 
rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + for row in cursor: number_triple += executor.submit(semantify_mysql, row, row_headers, sorted_sources[ @@ -6556,10 +9252,88 @@ def semantify(config_path, log_path='error.log'): config[dataset_i][ "password"], "None", predicate).result() - data.append(row) + data.append(row) else: - for row in data: - if config[dataset_i]["db"].lower() != "none": + if config[dataset_i]["db"].lower() != "none": + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + with open(dump_output, "w") as logical_output_descriptor: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + for row in data: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" 
in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + for row in data: number_triple += executor.submit(semantify_mysql, row, row_headers, sorted_sources[ @@ -6575,7 +9349,85 @@ def semantify(config_path, log_path='error.log'): "password"], config[dataset_i]["db"], predicate).result() - else: + else: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + for row in data: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") 
as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + for row in data: number_triple += executor.submit(semantify_mysql, row, row_headers, sorted_sources[ @@ -6633,6 +9485,81 @@ def semantify(config_path, log_path='error.log'): else: predicate = None if data == []: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + number_triple += executor.submit(semantify_postgres, row, + row_headers, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["user"], + config[dataset_i]["password"], + config[dataset_i]["db"], + config[dataset_i]["host"], + predicate).result() + cursor.execute(source) + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as 
tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in cursor: number_triple += executor.submit(semantify_postgres, row, row_headers, @@ -6647,6 +9574,80 @@ def semantify(config_path, log_path='error.log'): predicate).result() data.append(row) else: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + for row in data: + number_triple += executor.submit(semantify_postgres, row, + row_headers, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["user"], + config[dataset_i]["password"], + config[dataset_i]["db"], + config[dataset_i]["host"], + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + 
dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in data: number_triple += executor.submit(semantify_postgres, row, row_headers, @@ -6700,12 +9701,9 @@ def semantify(config_path, log_path='error.log'): for source_type in order_list: if source_type == "csv": for source in order_list[source_type]: - if enrichment == "yes": - reader = pd.read_csv(source, encoding="latin-1") - reader = reader.where(pd.notnull(reader), None) - if duplicate == "yes": - reader = reader.drop_duplicates(keep='first') - data = reader.to_dict(orient='records') + if ".nt" in source: + g = rdflib.Graph() + g.parse(source, format="nt") for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6714,13 +9712,91 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: + results = g.query(sorted_sources[source_type][source][triples_map].iterator) + data = [] + for row in results: + result_dict = {} + keys = list(row.__dict__["labels"].keys()) + i = 0 + while i < len(row): + result_dict[str(keys[i])] = str(row[keys[i]]) + i += 1 + data.append(result_dict) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + 
dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_file, sorted_sources[source_type][ source][triples_map], triples_map_list, ",", output_file_descriptor, - data).result() + data, True).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -6729,7 +9805,7 @@ def semantify(config_path, log_path='error.log'): generated_subjects = release_subjects( sorted_sources[source_type][source][triples_map], generated_subjects) - else: + elif "endpoint:" in source: for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6738,23 +9814,292 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: - blank_message = True - with open(source, "r", encoding="latin-1") as input_file_descriptor: - data = csv.DictReader(input_file_descriptor, delimiter=',') - number_triple += executor.submit(semantify_file, - sorted_sources[source_type][ - source][triples_map], - triples_map_list, ",", - output_file_descriptor, - data).result() - if duplicate == "yes": - predicate_list = release_PTT( - sorted_sources[source_type][source][triples_map], - predicate_list) - if mapping_partitions == "yes": - generated_subjects = release_subjects( - sorted_sources[source_type][source][triples_map], - generated_subjects) + sparql = SPARQLWrapper(source.replace("endpoint:","")) + sparql.setQuery(sorted_sources[source_type][source][triples_map].iterator) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + data = [] + for result in results["results"]["bindings"]: + result_dict = {} + for key, value in result.items(): + result_dict[key] = value["value"] + data.append(result_dict) + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = 
temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + if enrichment == "yes": + reader = pd.read_csv(source, encoding="latin-1") + reader = reader.where(pd.notnull(reader), None) + if duplicate == "yes": + reader = reader.drop_duplicates(keep='first') + data = reader.to_dict(orient='records') + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = 
is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + 
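# The per-dump post-processing that follows (re-serializing the N-Triples dump
# as JSON-LD/N3/RDF-JSON/RDF-XML/Turtle, or compressing it as tar.gz/tar.xz/
# gz/zip) is repeated verbatim for every source type in this patch. A minimal
# consolidated sketch of the same chain -- assuming the module-level rdflib,
# gzip, tarfile, zipfile, json, os and shutil imports, the existing
# extract_prefixes_from_ttl and generate_rdfjson helpers, and a hypothetical
# helper name postprocess_dump_output -- could be:
def postprocess_dump_output(dump_output, repeat_output, mapping_path):
    # An identical dump was already produced: reuse it instead of regenerating.
    if repeat_output != "":
        shutil.copy(repeat_output, dump_output)
        return
    # Re-serialize the N-Triples dump when the target name asks for another syntax.
    rdf_formats = {"jsonld": "json-ld", "n3": "n3", "rdfxml": "xml", "ttl": "ttl"}
    for marker, fmt in rdf_formats.items():
        if marker in dump_output:
            g = rdflib.Graph()
            g.parse(dump_output, format="nt")
            if marker == "jsonld":
                serialized = g.serialize(format=fmt, context=extract_prefixes_from_ttl(mapping_path))
            else:
                serialized = g.serialize(format=fmt)
            with open(dump_output, "w") as f:
                f.write(serialized)
            return
    if "rdfjson" in dump_output:
        g = rdflib.Graph()
        g.parse(dump_output, format="nt")
        with open(dump_output, "w") as f:
            json.dump(generate_rdfjson(g), f)
    # Otherwise compress the dump in place according to its extension.
    elif "tar.gz" in dump_output or "tar.xz" in dump_output:
        plain = dump_output.replace(".tar.gz", "").replace(".tar.xz", "")
        shutil.move(dump_output, plain)
        with tarfile.open(dump_output, "w:gz" if "tar.gz" in dump_output else "w:xz") as tar:
            tar.add(plain, arcname=plain)
    elif ".gz" in dump_output:
        plain = dump_output.replace(".gz", "")
        shutil.move(dump_output, plain)
        with open(plain, "rb") as f_in, gzip.open(dump_output, "wb") as f_out:
            f_out.writelines(f_in)
    elif ".zip" in dump_output:
        plain = dump_output.replace(".zip", "")
        shutil.move(dump_output, plain)
        with zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) as zip_file:
            zip_file.write(plain, os.path.basename(plain))
# Each call site would then reduce to:
# postprocess_dump_output(dump_output, repeat_output, config[dataset_i]["mapping"])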
with open(source, "r", encoding="latin-1") as input_file_descriptor: + data = csv.DictReader(input_file_descriptor, delimiter=',') + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) elif source_type == "JSONPath": for source in order_list[source_type]: for triples_map in sorted_sources[source_type][source]: @@ -6767,13 +10112,106 @@ def 
semantify(config_path, log_path='error.log'): triples_map].subject_map.rdf_class != [None]: if "http" in sorted_sources[source_type][source][ triples_map].data_source: - response = urlopen( - sorted_sources[source_type][source][triples_map].data_source) - data = json.loads(response.read()) + file_source = sorted_sources[source_type][source][triples_map].data_source + if "#" in file_source: + file = file_source.split("#")[1] + else: + file = file_source.split("/")[len(file_source.split("/"))-1] + if "gz" in file_source or "zip" in file_source or "tar.xz" in file_source or "tar.gz" in file_source: + response = requests.get(file_source) + with open(file, "wb") as f: + f.write(response.content) + if "zip" in file_source: + with zipfile.ZipFile(file, 'r') as zip_ref: + zip_ref.extractall() + data = json.load(open(file.replace(".zip",""))) + elif "tar.xz" in file_source or "tar.gz" in file_source: + with tarfile.open(file, "r") as tar: + tar.extractall() + if "tar.xz" in file_source: + data = json.load(open(file.replace(".tar.xz",""))) + else: + data = json.load(open(file.replace(".tar.gz",""))) + elif "gz" in file_source: + with open(file, "rb") as gz_file: + with open(file.replace(".gz",""), "wb") as txt_file: + shutil.copyfileobj(gzip.GzipFile(fileobj=gz_file), txt_file) + data = json.load(open(file.replace(".gz",""))) + else: + response = urlopen(file_source) + data = json.loads(response.read()) else: data = json.load( sorted_sources[source_type][source][triples_map].data_source) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_json, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + ",", logical_output_descriptor, data, + sorted_sources[source_type][source][ + triples_map].iterator).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), 
arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_json, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6799,6 +10237,72 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_xml, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + logical_output_descriptor).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 
'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_xml, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6815,12 +10319,9 @@ def semantify(config_path, log_path='error.log'): for source_type in sorted_sources: if source_type == "csv": for source in sorted_sources[source_type]: - if enrichment == "yes": - reader = pd.read_csv(source, encoding="latin-1") - reader = reader.where(pd.notnull(reader), None) - if duplicate == "yes": - reader = reader.drop_duplicates(keep='first') - data = reader.to_dict(orient='records') + if ".nt" in source: + g = rdflib.Graph() + g.parse(source, format="nt") for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6829,13 +10330,192 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: + results = g.query(sorted_sources[source_type][source][triples_map].iterator) + data = [] + for row in results: + result_dict = {} + keys = list(row.__dict__["labels"].keys()) + i = 0 + while i < len(row): + result_dict[str(keys[i])] = str(row[keys[i]]) + i += 1 + data.append(result_dict) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + 
tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_file, sorted_sources[source_type][ source][triples_map], triples_map_list, ",", output_file_descriptor, - data).result() + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + elif "endpoint:" in source: + for triples_map in sorted_sources[source_type][source]: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + sparql = SPARQLWrapper(source.replace("endpoint:","")) + sparql.setQuery(sorted_sources[source_type][source][triples_map].iterator) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + data = [] + for result in results["results"]["bindings"]: + result_dict = {} + for key, value in result.items(): + result_dict[key] = value["value"] + data.append(result_dict) + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = 
rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -6845,31 +10525,199 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][triples_map], generated_subjects) else: - with open(source, "r", encoding="latin-1") as input_file_descriptor: - data = csv.DictReader(input_file_descriptor, delimiter=',') + if enrichment == "yes": + reader = pd.read_csv(source, encoding="latin-1") + reader = reader.where(pd.notnull(reader), None) + if duplicate == "yes": + reader = reader.drop_duplicates(keep='first') + data = reader.to_dict(orient='records') for triples_map in sorted_sources[source_type][source]: - if (len(sorted_sources[source_type][source][ - triples_map].predicate_object_maps_list) > 0 and - sorted_sources[source_type][source][ - triples_map].predicate_object_maps_list[ - 0].predicate_map.value != "None") or \ + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and sorted_sources[source_type][source][ - triples_map].subject_map.rdf_class != [None]: - blank_message = True - number_triple += executor.submit(semantify_file, - sorted_sources[source_type][ - source][triples_map], - triples_map_list, ",", - output_file_descriptor, - data).result() - if duplicate == "yes": - predicate_list = release_PTT( - sorted_sources[source_type][source][triples_map], - predicate_list) - if mapping_partitions == "yes": - generated_subjects = release_subjects( - sorted_sources[source_type][source][triples_map], - generated_subjects) + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in 
logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + with open(source, "r", encoding="latin-1") as input_file_descriptor: + data = csv.DictReader(input_file_descriptor, delimiter=',') + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + 
sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) elif source_type == "JSONPath": for source in 
sorted_sources[source_type]: for triples_map in sorted_sources[source_type][source]: @@ -6882,13 +10730,106 @@ def semantify(config_path, log_path='error.log'): triples_map].subject_map.rdf_class != [None]: if "http" in sorted_sources[source_type][source][ triples_map].data_source: - response = urlopen( - sorted_sources[source_type][source][triples_map].data_source) - data = json.loads(response.read()) + file_source = sorted_sources[source_type][source][triples_map].data_source + if "#" in file_source: + file = file_source.split("#")[1] + else: + file = file_source.split("/")[len(file_source.split("/"))-1] + if "gz" in file_source or "zip" in file_source or "tar.xz" in file_source or "tar.gz" in file_source: + response = requests.get(file_source) + with open(file, "wb") as f: + f.write(response.content) + if "zip" in file_source: + with zipfile.ZipFile(file, 'r') as zip_ref: + zip_ref.extractall() + data = json.load(open(file.replace(".zip",""))) + elif "tar.xz" in file_source or "tar.gz" in file_source: + with tarfile.open(file, "r") as tar: + tar.extractall() + if "tar.xz" in file_source: + data = json.load(open(file.replace(".tar.xz",""))) + else: + data = json.load(open(file.replace(".tar.gz",""))) + elif "gz" in file_source: + with open(file, "rb") as gz_file: + with open(file.replace(".gz",""), "wb") as txt_file: + shutil.copyfileobj(gzip.GzipFile(fileobj=gz_file), txt_file) + data = json.load(open(file.replace(".gz",""))) + else: + response = urlopen(file_source) + data = json.loads(response.read()) else: data = json.load(open( sorted_sources[source_type][source][triples_map].data_source)) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_json, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + ",", logical_output_descriptor, data, + sorted_sources[source_type][source][ + triples_map].iterator).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + 
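# The remote-source handling above (downloading a possibly zip/tar.gz/tar.xz/
# gzip-compressed JSON document and unpacking it before loading) recurs in
# both JSONPath passes. A minimal sketch of the same steps -- assuming the
# module-level requests, gzip, zipfile, tarfile, shutil and json imports, the
# urlopen import from urllib.request, and a hypothetical helper name
# fetch_json_source -- could be:
def fetch_json_source(file_source):
    # Derive a local file name from the fragment or the last path segment.
    file = file_source.split("#")[1] if "#" in file_source else file_source.split("/")[-1]
    if any(ext in file_source for ext in (".zip", ".tar.gz", ".tar.xz", ".gz")):
        response = requests.get(file_source)
        with open(file, "wb") as f:
            f.write(response.content)
        if ".zip" in file_source:
            with zipfile.ZipFile(file, "r") as zip_ref:
                zip_ref.extractall()
            plain = file.replace(".zip", "")
        elif ".tar.gz" in file_source or ".tar.xz" in file_source:
            with tarfile.open(file, "r") as tar:  # "r" transparently handles gz/xz
                tar.extractall()
            plain = file.replace(".tar.gz", "").replace(".tar.xz", "")
        else:  # plain .gz
            plain = file.replace(".gz", "")
            with gzip.open(file, "rb") as gz_file, open(plain, "wb") as txt_file:
                shutil.copyfileobj(gz_file, txt_file)
        with open(plain) as f:
            return json.load(f)
    # Uncompressed remote document: read it straight from the URL.
    with urlopen(file_source) as response:
        return json.loads(response.read())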
with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_json, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6914,6 +10855,72 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_xml, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + logical_output_descriptor).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) 
+ with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_xml, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6971,6 +10978,101 @@ def semantify(config_path, log_path='error.log'): else: predicate = None if data == []: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + current_logical_dump = dump_output + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + if config[dataset_i]["db"].lower() != "none": + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + else: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() + current_logical_dump = "" + cursor.execute(source) + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + 
elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in cursor: if config[dataset_i]["db"].lower() != "none": number_triple += executor.submit(semantify_mysql, row, @@ -7005,38 +11107,133 @@ def semantify(config_path, log_path='error.log'): predicate).result() data.append(row) else: - for row in data: - if config[dataset_i]["db"].lower() != "none": - number_triple += executor.submit(semantify_mysql, row, - row_headers, - sorted_sources[ - source_type][source][ - triples_map], - triples_map_list, - output_file_descriptor, - config[dataset_i]["host"], - int(config[dataset_i][ - "port"]), - config[dataset_i]["user"], - config[dataset_i][ - "password"], - config[dataset_i]["db"], - predicate).result() - else: - number_triple += executor.submit(semantify_mysql, row, - row_headers, - sorted_sources[ - source_type][source][ - triples_map], - triples_map_list, - output_file_descriptor, - config[dataset_i]["host"], - int(config[dataset_i][ - "port"]), - config[dataset_i]["user"], - config[dataset_i][ - "password"], "None", - predicate).result() + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + for row in data: + if config[dataset_i]["db"].lower() != "none": + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + else: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif 
"rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + else: + for row in data: + if config[dataset_i]["db"].lower() != "none": + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + output_file_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + else: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + output_file_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -7081,6 +11278,81 @@ def semantify(config_path, log_path='error.log'): else: predicate = None if data == []: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + number_triple += executor.submit(semantify_postgres, row, + row_headers, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["user"], + config[dataset_i]["password"], + config[dataset_i]["db"], + config[dataset_i]["host"], + predicate).result() + current_logical_dump = "" + cursor.execute(source) + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with 
open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in cursor: number_triple += executor.submit(semantify_postgres, row, row_headers, @@ -7095,6 +11367,80 @@ def semantify(config_path, log_path='error.log'): predicate).result() data.append(row) else: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in data: + number_triple += executor.submit(semantify_postgres, row, + row_headers, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["user"], + config[dataset_i]["password"], + config[dataset_i]["db"], + config[dataset_i]["host"], + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + 
json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in data: number_triple += executor.submit(semantify_postgres, row, row_headers, @@ -7123,28 +11469,4 @@ def semantify(config_path, log_path='error.log'): duration = time.time() - start_time - logger.info("Successfully semantified all datasets in {:.3f} seconds.".format(duration)) - - -""" -According to the meeting held on 11.04.2018, semantifying json files != a top priority right -now, thus the reimplementation of following functions remain largely undocumented and unfinished. 
-
-def json_generator(file_descriptor, iterator):
-    if len(iterator) != 0:
-        if "[*]" not in iterator[0] and iterator[0] != "$":
-            yield from json_generator(file_descriptor[iterator[0]], iterator[1:])
-        elif "[*]" not in iterator[0] and iterator[0] == "$":
-            yield from json_generator(file, iterator[1:])
-        elif "[*]" in iterator[0] and "$" not in iterator[0]:
-            file_array = file_descriptor[iterator[0].replace("[*]","")]
-            for array_elem in file_array:
-                yield from json_generator(array_elem, iterator[1:])
-        elif iterator[0] == "$[*]":
-            for array_elem in file_descriptor:
-                yield from json_generator(array_elem, iterator[1:])
-        else:
-            yield file_descriptor
-
-
-"""
+    logger.info("Successfully semantified all datasets in {:.3f} seconds.".format(duration))
\ No newline at end of file
diff --git a/rdfizer/rdfizer/fnml_functions.py b/rdfizer/rdfizer/fnml_functions.py
new file mode 100644
index 0000000..22454cc
--- /dev/null
+++ b/rdfizer/rdfizer/fnml_functions.py
@@ -0,0 +1,151 @@
+import re
+import sys
+import os
+from .functions import *
+################################################################################################
+############################ Static (Do NOT change this code) ##################################
+################################################################################################
+
+global global_dic
+global_dic = {}
+global functions_pool
+
+#####################################################################################################
+########### ADD THE IMPLEMENTATION OF YOUR FUNCTIONS HERE FOLLOWING THE EXAMPLES ####################
+#####################################################################################################
+
+functions_pool = {"toLowerCase":"","toUpperCase":"","toUpperCaseURL":"",
+                  "replaceValue":"","concat2":"","uuid":"","helloworld":"",
+                  "escape":"","schema":"","string_replace":"",
+                  "parseURL":""}
+
+
+## Define your functions here, following the examples below. The column names from the CSV
+## files that you want to use as function input parameters only need to be provided as the
+## keys of "global_dic".
+def toLowerCase():
+    return str(global_dic["valueParam"]).lower()
+
+def toUpperCase():
+    return str(global_dic["valueParam"]).upper()
+
+def helloworld():
+    return "Hello World!"
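+
+## A minimal illustrative sketch (editor's addition, not part of the original function pool):
+## a new function plugs in by reading its inputs from the keys of "global_dic", returning the
+## computed value, and being registered by name in "functions_pool" above so that
+## execute_function() can dispatch to it. "trim" and its parameter "valueParam" are assumed
+## names that mirror the examples above.
+def trim():
+    # Strip leading and trailing whitespace from the referenced value.
+    return str(global_dic["valueParam"]).strip()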
+ +def string_replace(): + return global_dic["valueParam"].replace(global_dic["param_find"],global_dic["param_replace"]) + +def parseURL(): + parsed = {} + parsed["protocolOutput"] = global_dic["stringParameter"].split("://")[0] + if "#" in global_dic["stringParameter"]: + parsed["stringOutput"] = global_dic["stringParameter"].split("://")[1].split("#")[1] + parsed["domainOutput"] = global_dic["stringParameter"].split("://")[1].split("#")[0] + else: + parsed["stringOutput"] = global_dic["stringParameter"].split("://")[1].split("/")[len(global_dic["stringParameter"].split("://")[1].split("/"))-1] + replace_end = "/" + parsed["stringOutput"] + parsed["domainOutput"] = global_dic["stringParameter"].split("://")[1].replace(replace_end,"") + return parsed + +def concat2(): + value1 = global_dic["value1"] + value2 = global_dic["value2"] + if bool(value1) and bool(value2): + result = str(str(value1)+str(value2)) + else: + result = "" + return(result) + +def uuid(): + from uuid import uuid4 + return str(uuid4()) + +def escape(): + if global_dic["modeParam"] == 'html': + import html + return html.escape(global_dic["valueParam"]) + elif global_dic["modeParam"] == 'url': + import urllib.parse + return urllib.parse.quote(global_dic["valueParam"]) + else: + raise ValueError("Invalid mode. Use 'html' for HTML escaping or 'url' for URL escaping.") + +def toUpperCaseURL(): + url_lower = global_dic["str"].lower() + + if url_lower.startswith('https://'): + return global_dic["str"].upper() + elif url_lower.startswith('http://'): + return global_dic["str"].upper() + + # else: + return f'http://{encode_char(global_dic["str"].upper())}' + +def schema(): + return "https://schema.org/" + encode_char(global_dic["stringParameter"]) +################################################################################################ +############################ Static (Do NOT change this code) ################################## +################################################################################################ + +def execute_function(row,header,dic): + if "#" in dic["function"]: + func = dic["function"].split("#")[1] + else: + func = dic["function"].split("/")[len(dic["function"].split("/"))-1] + if func in functions_pool: + global global_dic + global_dic = execution_dic(row,header,dic) + if global_dic == None: + print("Error when executing function") + return None + else: + return eval(func + "()") + else: + print("Invalid function") + print("Aborting...") + sys.exit(1) + +def execution_dic(row,header,dic): + output = {} + for inputs in dic["inputs"]: + if isinstance(inputs,list): + if "constant" not in inputs: + if "reference" in inputs[1]: + if isinstance(row,dict): + output[inputs[2]] = row[inputs[0]] + else: + output[inputs[2]] = row[header.index(inputs[0])] + elif "template" in inputs: + if isinstance(row,dict): + output[inputs[2]] = string_substitution(inputs[0], "{(.+?)}", row, "subject", "yes", "None") + else: + output[inputs[2]] = string_substitution_array(inputs[0], "{(.+?)}", row, header, "subject", "yes") + else: + output[inputs[2]] = inputs[0] + else: + if "#" in inputs: + param = inputs.split("#")[1] + else: + param = inputs.split("/")[len(inputs.split("/"))-1] + if "constant" != dic["inputs"][inputs]["type"]: + if "reference" == dic["inputs"][inputs]["type"]: + if isinstance(row,dict): + if dic["inputs"][inputs]["value"] in row: + output[param] = row[dic["inputs"][inputs]["value"]] + else: + return None + else: + if dic["inputs"][inputs]["value"] in header: + output[param] = 
row[header.index(dic["inputs"][inputs]["value"])]
+                    else:
+                        return None
+            elif "template" == dic["inputs"][inputs]["type"]:
+                if isinstance(row,dict):
+                    output[param] = string_substitution(dic["inputs"][inputs]["value"], "{(.+?)}", row, "subject", "yes", "None")
+                else:
+                    output[param] = string_substitution_array(dic["inputs"][inputs]["value"], "{(.+?)}", row, header, "subject", "yes")
+                if output[param] == None:
+                    return None
+            else:
+                output[param] = dic["inputs"][inputs]["value"]
+    return output
\ No newline at end of file
diff --git a/rdfizer/rdfizer/functions.py b/rdfizer/rdfizer/functions.py
index c4cc837..a7c0f83 100644
--- a/rdfizer/rdfizer/functions.py
+++ b/rdfizer/rdfizer/functions.py
@@ -5,6 +5,109 @@ import xml.etree.ElementTree as ET
 import urllib
 import math
+import rdflib
+
+def generate_rdfjson(graph):
+    json_data = {}
+    for subj, pred, obj in graph:
+        if subj not in json_data:
+            json_data[subj] = {pred:[{"value":obj}]}
+        else:
+            if pred not in json_data[subj]:
+                json_data[subj][pred] = [{"value":obj}]
+            else:
+                json_data[subj][pred].append({"value":obj})
+    return json_data
+
+
+def extract_prefixes_from_ttl(ttl_file):
+    g = rdflib.Graph()
+    g.parse(ttl_file, format="ttl")
+
+    prefixes = {}
+    for prefix, uri in g.namespaces():
+        prefixes[prefix] = uri
+
+    return prefixes
+
+def is_repeat_output(current_output,output_list):
+    # Return the earliest earlier output that shares the same value as current_output, "" if none
+    for source in output_list:
+        if source != current_output:
+            if output_list[source] == output_list[current_output]:
+                keys = list(output_list.keys())
+                if keys.index(source) < keys.index(current_output):
+                    return source
+                else:
+                    return ""
+    return ""
+
+def is_current_output_valid(triples_map_id,po_map,current_output,output_list):
+    if current_output == "":
+        if triples_map_id in output_list:
+            for possible_output in output_list[triples_map_id]:
+                if output_list[triples_map_id][possible_output] == "subject":
+                    return False
+                elif po_map.predicate_map.value in output_list[triples_map_id][possible_output]:
+                    return False
+                elif po_map.object_map.value in output_list[triples_map_id][possible_output]:
+                    return False
+                elif po_map.object_map.datatype != None:
+                    if po_map.object_map.value + "_" + po_map.object_map.datatype in output_list[triples_map_id][possible_output]:
+                        return False
+                elif po_map.object_map.datatype_map != None:
+                    if po_map.object_map.value + "_" + po_map.object_map.datatype_map in output_list[triples_map_id][possible_output]:
+                        return False
+                elif po_map.object_map.language != None:
+                    if po_map.object_map.value + "_" + po_map.object_map.language in output_list[triples_map_id][possible_output]:
+                        return False
+                elif po_map.object_map.language_map != None:
+                    if po_map.object_map.value + "_" + po_map.object_map.language_map in output_list[triples_map_id][possible_output]:
+                        return False
+            return True
+        else:
+            return True
+    else:
+        if triples_map_id in output_list:
+            if current_output in output_list[triples_map_id]:
+                if output_list[triples_map_id][current_output] == "subject":
+                    return True
+                elif po_map.predicate_map.value in output_list[triples_map_id][current_output]:
+                    return True
+                elif po_map.object_map.value in output_list[triples_map_id][current_output]:
+                    return True
+                elif po_map.object_map.datatype != None:
+                    if po_map.object_map.value + "_" + po_map.object_map.datatype in output_list[triples_map_id][current_output]:
+                        return True
+                    else:
+                        return False
+                elif po_map.object_map.datatype_map != None:
+                    if po_map.object_map.value + "_" + po_map.object_map.datatype_map in output_list[triples_map_id][current_output]:
+                        return
True + else: + return False + elif po_map.object_map.language != None: + if po_map.object_map.value + "_" + po_map.object_map.language in output_list[triples_map_id][current_output]: + return True + else: + return False + elif po_map.object_map.language_map != None: + if po_map.object_map.value + "_" + po_map.object_map.language_map in output_list[triples_map_id][current_output]: + return True + else: + return False + else: + return False + else: + return True + else: + return True + +def is_valid_url_syntax(url): + try: + result = urllib.parse.urlparse(url) + return all([result.scheme, result.netloc]) + except ValueError: + return False def extract_subject_values(row,attr_list,format, parent_map = None): subject_attr = "" @@ -417,15 +520,7 @@ def extract_base(file): return base def encode_char(string): - encoded = "" - valid_char = ["~","#","/"]#,":"] - for s in string: - if s in valid_char: - encoded += s - elif s == "/": - encoded += "%2F" - else: - encoded += urllib.parse.quote(s) + encoded = urllib.parse.quote(string, safe='_-.~/:@&=+',encoding='utf-8') return encoded def combine_sublist(sublists, full_list): @@ -614,62 +709,129 @@ def files_sort(triples_map_list, ordered, config): else: source_predicate["XPath"][str(tp.data_source)] = {po.predicate_map.value : ""} else: - if tp.query == "None": - if tp.iterator == "None": - if config["datasets"]["dbType"] == "mysql": - database, query_list = translate_sql(tp) - elif config["datasets"]["dbType"] == "postgres": - database, query_list = translate_postgressql(tp) - query = query_list[0] + if "SPARQL" in tp.file_format: + if "csv" not in sorted_list: + if ".nt" in str(tp.data_source): + sorted_list["csv"] = {str(tp.data_source) : {tp.triples_map_id : tp}} + else: + sorted_list["csv"] = {"endpoint:" + str(tp.data_source) : {tp.triples_map_id : tp}} else: - if "select" in tp.iterator.lower(): - query = tp.iterator + if ".nt" in str(tp.data_source): + if str(tp.data_source) in sorted_list["csv"]: + sorted_list["csv"][str(tp.data_source)][tp.triples_map_id] = tp + else: + sorted_list["csv"][str(tp.data_source)] = {tp.triples_map_id : tp} else: + if "endpoint:" + str(tp.data_source) in sorted_list["csv"]: + sorted_list["csv"]["endpoint:" + str(tp.data_source)][tp.triples_map_id] = tp + else: + sorted_list["csv"]["endpoint:" + str(tp.data_source)] = {tp.triples_map_id : tp} + for po in tp.predicate_object_maps_list: + if po.predicate_map.value in general_predicates: + predicate = po.predicate_map.value + "_" + po.object_map.value + if predicate in predicate_list: + predicate_list[predicate] += 1 + else: + predicate_list[predicate] = 1 + else: + if po.predicate_map.value in predicate_list: + predicate_list[po.predicate_map.value] += 1 + else: + predicate_list[po.predicate_map.value] = 1 + if "csv" not in source_predicate: + if po.predicate_map.value in general_predicates: + predicate = po.predicate_map.value + "_" + po.object_map.value + if ".nt" in str(tp.data_source): + source_predicate["csv"] = {str(tp.data_source) : {predicate : ""}} + else: + source_predicate["csv"] = {"endpoint:" + str(tp.data_source) : {predicate : ""}} + else: + if ".nt" in str(tp.data_source): + source_predicate["csv"] = {str(tp.data_source) : {po.predicate_map.value : ""}} + else: + source_predicate["csv"] = {"endpoint:" + str(tp.data_source) : {po.predicate_map.value : ""}} + else: + if str(tp.data_source) in source_predicate["csv"]: + if po.predicate_map.value in general_predicates: + predicate = po.predicate_map.value + "_" + po.object_map.value + if ".nt" in 
str(tp.data_source): + source_predicate["csv"][str(tp.data_source)][predicate] = "" + else: + source_predicate["csv"]["endpoint:" + str(tp.data_source)][predicate] = "" + else: + if ".nt" in str(tp.data_source): + source_predicate["csv"][str(tp.data_source)][po.predicate_map.value] = "" + else: + source_predicate["csv"]["endpoint:" + str(tp.data_source)][po.predicate_map.value] = "" + else: + if po.predicate_map.value in general_predicates: + predicate = po.predicate_map.value + "_" + po.object_map.value + if ".nt" in str(tp.data_source): + source_predicate["csv"][str(tp.data_source)] = {predicate : ""} + else: + source_predicate["csv"]["endpoint:" + str(tp.data_source)] = {predicate : ""} + else: + if ".nt" in str(tp.data_source): + source_predicate["csv"][str(tp.data_source)] = {po.predicate_map.value : ""} + else: + source_predicate["csv"]["endpoint:" + str(tp.data_source)] = {po.predicate_map.value : ""} + else: + if tp.query == "None": + if tp.iterator == "None": if config["datasets"]["dbType"] == "mysql": database, query_list = translate_sql(tp) elif config["datasets"]["dbType"] == "postgres": database, query_list = translate_postgressql(tp) query = query_list[0] - else: - query = tp.query - if config["datasets"]["dbType"] not in sorted_list: - sorted_list[config["datasets"]["dbType"]] = {query: {tp.triples_map_id : tp}} - else: - if query in sorted_list[config["datasets"]["dbType"]]: - sorted_list[config["datasets"]["dbType"]][query][tp.triples_map_id] = tp - else: - sorted_list[config["datasets"]["dbType"]][query] = {tp.triples_map_id : tp} - for po in tp.predicate_object_maps_list: - if po.predicate_map.value in general_predicates: - predicate = po.predicate_map.value + "_" + po.object_map.value - if predicate in predicate_list: - predicate_list[predicate] += 1 else: - predicate_list[predicate] = 1 + if "select" in tp.iterator.lower(): + query = tp.iterator + else: + if config["datasets"]["dbType"] == "mysql": + database, query_list = translate_sql(tp) + elif config["datasets"]["dbType"] == "postgres": + database, query_list = translate_postgressql(tp) + query = query_list[0] else: - if po.predicate_map.value in predicate_list: - predicate_list[po.predicate_map.value] += 1 + query = tp.query + if config["datasets"]["dbType"] not in sorted_list: + sorted_list[config["datasets"]["dbType"]] = {query: {tp.triples_map_id : tp}} + else: + if query in sorted_list[config["datasets"]["dbType"]]: + sorted_list[config["datasets"]["dbType"]][query][tp.triples_map_id] = tp else: - predicate_list[po.predicate_map.value] = 1 - if config["datasets"]["dbType"] not in source_predicate: + sorted_list[config["datasets"]["dbType"]][query] = {tp.triples_map_id : tp} + for po in tp.predicate_object_maps_list: if po.predicate_map.value in general_predicates: predicate = po.predicate_map.value + "_" + po.object_map.value - source_predicate[config["datasets"]["dbType"]] = {query : {predicate : ""}} + if predicate in predicate_list: + predicate_list[predicate] += 1 + else: + predicate_list[predicate] = 1 else: - source_predicate[config["datasets"]["dbType"]] = {query : {po.predicate_map.value : ""}} - else: - if query in source_predicate[config["datasets"]["dbType"]]: + if po.predicate_map.value in predicate_list: + predicate_list[po.predicate_map.value] += 1 + else: + predicate_list[po.predicate_map.value] = 1 + if config["datasets"]["dbType"] not in source_predicate: if po.predicate_map.value in general_predicates: predicate = po.predicate_map.value + "_" + po.object_map.value - 
source_predicate[config["datasets"]["dbType"]][query][predicate] = "" + source_predicate[config["datasets"]["dbType"]] = {query : {predicate : ""}} else: - source_predicate[config["datasets"]["dbType"]][query][po.predicate_map.value] = "" + source_predicate[config["datasets"]["dbType"]] = {query : {po.predicate_map.value : ""}} else: - if po.predicate_map.value in general_predicates: - predicate = po.predicate_map.value + "_" + po.object_map.value - source_predicate[config["datasets"]["dbType"]][query] = {predicate : ""} + if query in source_predicate[config["datasets"]["dbType"]]: + if po.predicate_map.value in general_predicates: + predicate = po.predicate_map.value + "_" + po.object_map.value + source_predicate[config["datasets"]["dbType"]][query][predicate] = "" + else: + source_predicate[config["datasets"]["dbType"]][query][po.predicate_map.value] = "" else: - source_predicate[config["datasets"]["dbType"]][query] = {po.predicate_map.value : ""} + if po.predicate_map.value in general_predicates: + predicate = po.predicate_map.value + "_" + po.object_map.value + source_predicate[config["datasets"]["dbType"]][query] = {predicate : ""} + else: + source_predicate[config["datasets"]["dbType"]][query] = {po.predicate_map.value : ""} if tp.subject_map.rdf_class is not None: for rdf_type in tp.subject_map.rdf_class: predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + "_" + "<{}>".format(rdf_type) @@ -760,6 +922,8 @@ def string_substitution_json(string, pattern, row, term, ignore, iterator): match = reference_match.group(1) else: match = reference_match.group(1).split("[")[0] + if match[:2] == "$.": + match = match[2:] if "\\" in match: temp = match.split("{") match = temp[len(temp)-1] @@ -871,6 +1035,8 @@ def string_substitution_json(string, pattern, row, term, ignore, iterator): elif pattern == ".+": match = reference_match.group(0) if "[*]" in match: + if match[:2] == "$.": + match = match[2:] child_list = row[match.split("[*]")[0]] match = match.split(".")[1:] object_list = [] @@ -909,6 +1075,8 @@ def string_substitution_json(string, pattern, row, term, ignore, iterator): new_string = string return object_list else: + if match[:2] == "$.": + match = match[2:] if "." 
in match: if match in row: value = row[match] @@ -1011,7 +1179,15 @@ def string_substitution_xml(string, pattern, row, term, iterator, parent_map, na else: return None else: - return None + if match in row.attrib: + if row.attrib[match] is not None: + if re.search("^[\s|\t]*$", row.attrib[match]) is None: + new_string = new_string[:start + offset_current_substitution] + encode_char(row.attrib[match].strip()) + new_string[end + offset_current_substitution:] + offset_current_substitution = offset_current_substitution + len(encode_char(row.attrib[match].strip())) - (end - start) + else: + return None + else: + return None else: if temp_list: match = reference_match.group(1).split("[")[0] @@ -1183,6 +1359,8 @@ def string_substitution_xml(string, pattern, row, term, iterator, parent_map, na offset_current_substitution = offset_current_substitution + len(child.attrib[match]) - (end - start) string_list.append(new_string) else: + if "/text()" in match: + match = match.replace("/text()","") if match in iterator: if re.search("^[\s|\t]*$", row.text) is None: new_string = new_string[:start + offset_current_substitution] + "\"" + row.text.strip() + "\"" + new_string[ end + offset_current_substitution:] diff --git a/rdfizer/rdfizer/inner_functions.py b/rdfizer/rdfizer/inner_functions.py new file mode 100644 index 0000000..773481b --- /dev/null +++ b/rdfizer/rdfizer/inner_functions.py @@ -0,0 +1,680 @@ +import os +import re +import datetime +import sys +import urllib +import math +from .functions import * +global inner_join_table +inner_join_table = {} +global general_predicates +general_predicates = {"http://www.w3.org/2000/01/rdf-schema#subClassOf": "", + "http://www.w3.org/2002/07/owl#sameAs": "", + "http://www.w3.org/2000/01/rdf-schema#seeAlso": "", + "http://www.w3.org/2000/01/rdf-schema#subPropertyOf": ""} + +def inner_hash_maker(parent_data, parent_subject, child_object, quoted, triples_map_list): + global blank_message + hash_table = {} + for row in parent_data: + if quoted == "": + if child_object.parent[0] in row.keys(): + if row[child_object.parent[0]] in hash_table: + if duplicate == "yes": + if parent_subject.subject_map.subject_mapping_type == "reference": + value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, + parent_subject.iterator) + if value != None: + if "http" in value and "<" not in value: + value = "<" + value[1:-1] + ">" + elif "http" in value and "<" in value: + value = value[1:-1] + if value not in hash_table[row[child_object.parent[0]]]: + hash_table[row[child_object.parent[0]]].update({value: "object"}) + else: + if string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, + parent_subject.iterator) != None: + value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", + ignore, parent_subject.iterator) + if value != None: + if parent_subject.subject_map.term_type != None: + if "BlankNode" in parent_subject.subject_map.term_type: + if "/" in value: + value = "_:" + encode_char(value.replace("/", "2F")).replace("%", "") + if "." in value: + value = value.replace(".", "2E") + if blank_message: + logger.warning( + "Incorrect format for Blank Nodes. \"/\" will be replace with \"2F\".") + blank_message = False + else: + value = "_:" + encode_char(value).replace("%", "") + if "." 
in value:
+                                                    value = value.replace(".", "2E")
+                                        else:
+                                            value = "<" + value + ">"
+                                    hash_table[row[child_object.parent[0]]].update({value: "object"})
+
+            else:
+                if parent_subject.subject_map.subject_mapping_type == "reference":
+                    value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore,
+                                                parent_subject.iterator)
+                    if value != None:
+                        if "http" in value and "<" not in value:
+                            value = "<" + value[1:-1] + ">"
+                        elif "http" in value and "<" in value:
+                            value = value[1:-1]
+                        hash_table.update({row[child_object.parent[0]]: {value: "object"}})
+                else:
+                    value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore,
+                                                parent_subject.iterator)
+                    if value != None:
+                        if parent_subject.subject_map.term_type != None:
+                            if "BlankNode" in parent_subject.subject_map.term_type:
+                                if "/" in value:
+                                    value = "_:" + encode_char(value.replace("/", "2F")).replace("%", "")
+                                    if "." in value:
+                                        value = value.replace(".", "2E")
+                                    if blank_message:
+                                        logger.warning(
+                                            "Incorrect format for Blank Nodes. \"/\" will be replaced with \"2F\".")
+                                        blank_message = False
+                                else:
+                                    value = "_:" + encode_char(value).replace("%", "")
+                                    if "." in value:
+                                        value = value.replace(".", "2E")
+                            else:
+                                value = "<" + value + ">"
+                        hash_table.update({row[child_object.parent[0]]: {value: "object"}})
+        else:
+            for triples in inner_semantify_file(parent_subject, triples_map_list, ",", row, base):
+                if triples != None:
+                    if isinstance(child_object.parent,list):
+                        parent = child_object.parent[0]
+                    else:
+                        parent = child_object.parent
+                    if row[parent] in hash_table:
+                        if duplicate == "yes":
+                            if triples not in hash_table[row[parent]]:
+                                hash_table[row[parent]].update({triples : "subject"})
+                        else:
+                            hash_table[row[parent]].update({triples : "subject"})
+                    else:
+                        hash_table.update({row[parent] : {triples : "subject"}})
+    if isinstance(child_object.child,list):
+        join_table.update({parent_subject.triples_map_id + "_" + child_object.child[0] : hash_table})
+    else:
+        join_table.update({"quoted_" + parent_subject.triples_map_id + "_" + child_object.child[0] : hash_table})
+
+def inner_semantify_file(triples_map, triples_map_list, delimiter, row, base):
+    object_list = []
+    subject_list = []
+    triples_list = []
+
+    if triples_map.subject_map.subject_mapping_type == "template":
+        subject_value = string_substitution(triples_map.subject_map.value, "{(.+?)}", row, "subject", "yes", triples_map.iterator)
+        if triples_map.subject_map.term_type is None:
+            if triples_map.subject_map.condition == "":
+
+                try:
+                    subject = "<" + subject_value + ">"
+                except:
+                    subject = None
+
+            else:
+                # field, condition = condition_separetor(triples_map.subject_map.condition)
+                # if row[field] == condition:
+                try:
+                    subject = "<" + subject_value + ">"
+                except:
+                    subject = None
+        else:
+            if "IRI" in triples_map.subject_map.term_type:
+                subject_value = string_substitution(triples_map.subject_map.value, "{(.+?)}", row, "subject", "yes", triples_map.iterator)
+                if triples_map.subject_map.condition == "":
+
+                    try:
+                        if "http" not in subject_value:
+                            subject = "<" + base + subject_value + ">"
+                        else:
+                            subject = "<" + encode_char(subject_value) + ">"
+                    except:
+                        subject = None
+
+                else:
+                    # field, condition = condition_separetor(triples_map.subject_map.condition)
+                    # if row[field] == condition:
+                    try:
+                        if "http" not in subject_value:
+                            subject = "<" + base + subject_value + ">"
+                        else:
+                            subject = "<" + subject_value + ">"
+                    except:
+                        subject = None
+
+            elif "BlankNode" in triples_map.subject_map.term_type:
+                if triples_map.subject_map.condition == "":
+                    try:
+                        if "/" in subject_value:
+                            subject = "_:" + encode_char(subject_value.replace("/","2F")).replace("%","")
+                            if "." in subject:
+                                subject = subject.replace(".","2E")
+                            if blank_message:
+                                print("Incorrect format for Blank Nodes. \"/\" will be replaced with \"2F\".")
+                                blank_message = False
+                        else:
+                            subject = "_:" + encode_char(subject_value).replace("%","")
+                            if "." in subject:
+                                subject = subject.replace(".","2E")
+                    except:
+                        subject = None
+
+                else:
+                    # field, condition = condition_separetor(triples_map.subject_map.condition)
+                    # if row[field] == condition:
+                    try:
+                        subject = "_:" + subject_value
+                    except:
+                        subject = None
+            elif "Literal" in triples_map.subject_map.term_type:
+                subject = None
+            else:
+                if triples_map.subject_map.condition == "":
+
+                    try:
+                        subject = "<" + subject_value + ">"
+                    except:
+                        subject = None
+
+                else:
+                    # field, condition = condition_separetor(triples_map.subject_map.condition)
+                    # if row[field] == condition:
+                    try:
+                        subject = "<" + subject_value + ">"
+                    except:
+                        subject = None
+    elif "reference" in triples_map.subject_map.subject_mapping_type:
+        subject_value = string_substitution(triples_map.subject_map.value, ".+", row, "subject","yes" , triples_map.iterator)
+        if subject_value != None:
+            subject_value = subject_value[1:-1]
+        if triples_map.subject_map.condition == "":
+            if " " not in subject_value:
+                if "BlankNode" in triples_map.subject_map.term_type:
+                    subject = "_:" + subject_value
+                else:
+                    if "http" not in subject_value:
+                        subject = "<" + base + subject_value + ">"
+                    else:
+                        subject = "<" + subject_value + ">"
+            else:
+                subject = None
+
+        else:
+            # field, condition = condition_separetor(triples_map.subject_map.condition)
+            # if row[field] == condition:
+            try:
+                if "http" not in subject_value:
+                    subject = "<" + base + subject_value + ">"
+                else:
+                    subject = "<" + subject_value + ">"
+            except:
+                subject = None
+
+    elif "constant" in triples_map.subject_map.subject_mapping_type:
+        subject = "<" + triples_map.subject_map.value + ">"
+
+    elif "quoted triples map" in triples_map.subject_map.subject_mapping_type:
+        for triples_map_element in triples_map_list:
+            if triples_map_element.triples_map_id == triples_map.subject_map.value:
+                if triples_map_element.data_source != triples_map.data_source:
+                    if triples_map.subject_map.parent[0] != None:
+                        if ("quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child[0]) not in join_table:
+                            if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath":
+                                with open(str(triples_map_element.data_source), "r") as input_file_descriptor:
+                                    if str(triples_map_element.file_format).lower() == "csv":
+                                        data = csv.DictReader(input_file_descriptor, delimiter=',')
+                                        hash_maker(data, triples_map_element, triples_map.subject_map, "quoted", triples_map_list)
+                            else:
+                                pass
+                        if row[triples_map.subject_map.child[0]] in join_table["quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child[0]]:
+                            subject_list = join_table["quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child[0]][row[triples_map.subject_map.child[0]]]
+                else:
+                    subject_list = inner_semantify_file(triples_map_element, triples_map_list, delimiter, row, base)
+        subject = None
+
+    else:
+        if triples_map.subject_map.condition == "":
+
+            try:
+                subject = "\"" + triples_map.subject_map.value + "\""
+            except:
+                subject = None
+
+        else:
+            # field, condition = condition_separetor(triples_map.subject_map.condition)
+            # if row[field] == condition:
+            try:
+                subject = "\"" + triples_map.subject_map.value + "\""
+            except:
+                subject = None
+
+
+    if triples_map.subject_map.rdf_class != None and subject != None:
+        predicate = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
+        for rdf_class in triples_map.subject_map.rdf_class:
+            if rdf_class != None:
+                obj = "<{}>".format(rdf_class)
+                rdf_type = subject + " " + predicate + " " + obj
+                for graph in triples_map.subject_map.graph:
+                    if graph != None and "defaultGraph" not in graph:
+                        if "{" in graph:
+                            rdf_type = rdf_type[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject","yes", triples_map.iterator) + "> .\n"
+                        else:
+                            rdf_type = rdf_type[:-2] + " <" + graph + "> .\n"
+                triples_list.append(rdf_type)
+
+
+    for predicate_object_map in triples_map.predicate_object_maps_list:
+        if predicate_object_map.predicate_map.mapping_type == "constant" or predicate_object_map.predicate_map.mapping_type == "constant shortcut":
+            predicate = "<" + predicate_object_map.predicate_map.value + ">"
+        elif predicate_object_map.predicate_map.mapping_type == "template":
+            if predicate_object_map.predicate_map.condition != "":
+                #field, condition = condition_separetor(predicate_object_map.predicate_map.condition)
+                #if row[field] == condition:
+                try:
+                    predicate = "<" + string_substitution(predicate_object_map.predicate_map.value, "{(.+?)}", row, "predicate","yes", triples_map.iterator) + ">"
+                except:
+                    predicate = None
+                #else:
+                #    predicate = None
+            else:
+                try:
+                    predicate = "<" + string_substitution(predicate_object_map.predicate_map.value, "{(.+?)}", row, "predicate","yes", triples_map.iterator) + ">"
+                except:
+                    predicate = None
+        elif predicate_object_map.predicate_map.mapping_type == "reference":
+            if predicate_object_map.predicate_map.condition != "":
+                #field, condition = condition_separetor(predicate_object_map.predicate_map.condition)
+                #if row[field] == condition:
+                predicate = string_substitution(predicate_object_map.predicate_map.value, ".+", row, "predicate","yes", triples_map.iterator)
+                #else:
+                #    predicate = None
+            else:
+                predicate = string_substitution(predicate_object_map.predicate_map.value, ".+", row, "predicate","yes", triples_map.iterator)
+                predicate = "<" + predicate[1:-1] + ">"
+        else:
+            predicate = None
+
+        if predicate_object_map.object_map.mapping_type == "constant" or predicate_object_map.object_map.mapping_type == "constant shortcut":
+            if "/" in predicate_object_map.object_map.value:
+                object = "<" + predicate_object_map.object_map.value + ">"
+            else:
+                object = "\"" + predicate_object_map.object_map.value + "\""
+            if predicate_object_map.object_map.datatype != None:
+                object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype)
+        elif predicate_object_map.object_map.mapping_type == "template":
+            try:
+                if predicate_object_map.object_map.term is None:
+                    object = "<" + string_substitution(predicate_object_map.object_map.value, "{(.+?)}", row, "object","yes", triples_map.iterator) + ">"
+                elif "IRI" in predicate_object_map.object_map.term:
+                    object = "<" + string_substitution(predicate_object_map.object_map.value, "{(.+?)}", row, "object","yes", triples_map.iterator) + ">"
+                elif "BlankNode" in predicate_object_map.object_map.term:
+                    object = "_:" + string_substitution(predicate_object_map.object_map.value, "{(.+?)}", row, "object","yes", triples_map.iterator)
+                    if "/" in object:
+                        object = object.replace("/","2F")
+                        if blank_message:
+                            print("Incorrect format for Blank Nodes. \"/\" will be replaced with \"2F\".")
+                            blank_message = False
+                    if "."
+ elif predicate_object_map.object_map.mapping_type == "reference": + object = string_substitution(predicate_object_map.object_map.value, ".+", row, "object","yes", triples_map.iterator) + if object != None: + if "\\" in object[1:-1]: + object = "\"" + object[1:-1].replace("\\","\\\\") + "\"" + if "'" in object[1:-1]: + object = "\"" + object[1:-1].replace("'","\\\\'") + "\"" + if "\n" in object: + object = object.replace("\n","\\n") + if predicate_object_map.object_map.datatype != None: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.language != None: + if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language : + object += "@es" + elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language : + object += "@en" + elif len(predicate_object_map.object_map.language) == 2: + object += "@"+predicate_object_map.object_map.language + elif predicate_object_map.object_map.language_map != None: + lang = string_substitution(predicate_object_map.object_map.language_map, ".+", row, "object","yes", triples_map.iterator) + if lang != None: + object += "@"+ string_substitution(predicate_object_map.object_map.language_map, ".+", row, "object","yes", triples_map.iterator)[1:-1] + elif predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + if " " not in object: + object = "\"" + object[1:-1].replace("\\\\'","'") + "\"" + object = "<" + encode_char(object[1:-1]) + ">" + else: + object = None + elif "BlankNode" in predicate_object_map.object_map.term: + if " " not in object: + object = "_:" + object[1:-1] + else: + object = None + elif predicate_object_map.object_map.mapping_type == "parent triples map": + if subject != None: + for triples_map_inner in triples_map_list: + if triples_map_inner.triples_map_id == predicate_object_map.object_map.value: + if triples_map_inner.data_source != triples_map.data_source: + if len(predicate_object_map.object_map.child) == 1: + if (triples_map_inner.triples_map_id + "_" + predicate_object_map.object_map.child[0]) not in join_table: + if str(triples_map_inner.file_format).lower() == "csv" or triples_map_inner.file_format == "JSONPath": + with open(str(triples_map_inner.data_source), "r") as input_file_descriptor: + if str(triples_map_inner.file_format).lower() == "csv": + reader = pd.read_csv(str(triples_map_inner.data_source), dtype = str)#, encoding = "ISO-8859-1") + reader = reader.where(pd.notnull(reader), None) + reader = reader.drop_duplicates(keep ='first') + data = reader.to_dict(orient='records') + hash_maker(data, triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + else: + data = json.load(input_file_descriptor) + if triples_map_inner.iterator: + if triples_map_inner.iterator != "None" and triples_map_inner.iterator != "$.[*]": + join_iterator(data, triples_map_inner.iterator, triples_map_inner, predicate_object_map.object_map, triples_map_list) + else: + if isinstance(data, list): + hash_maker(data, triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + elif len(data) < 2: + hash_maker(data[list(data.keys())[0]], triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + else: + if isinstance(data, list): + hash_maker(data, triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + elif len(data) < 2: + hash_maker(data[list(data.keys())[0]], triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + + elif triples_map_inner.file_format == "XPath": + with open(str(triples_map_inner.data_source), "r") as input_file_descriptor: + child_tree = ET.parse(input_file_descriptor) + child_root = child_tree.getroot() + hash_maker_xml(child_root, triples_map_inner, predicate_object_map.object_map) + else: + database, query_list = translate_sql(triples_map) + db = connector.connect(host=host, port=int(port), user=user, password=password) + cursor = db.cursor(buffered=True) + cursor.execute("use " + database) + for query in query_list: + cursor.execute(query) + hash_maker_array(cursor, triples_map_inner, predicate_object_map.object_map) + + if sublist(predicate_object_map.object_map.child,row.keys()): + if child_list_value(predicate_object_map.object_map.child,row) in join_table[triples_map_inner.triples_map_id + "_" + child_list(predicate_object_map.object_map.child)]: + object_list = join_table[triples_map_inner.triples_map_id + "_" + child_list(predicate_object_map.object_map.child)][child_list_value(predicate_object_map.object_map.child,row)] + else: + if no_update: + if str(triples_map_inner.file_format).lower() == "csv" or triples_map_inner.file_format == "JSONPath": + with open(str(triples_map_inner.data_source), "r") as input_file_descriptor: + if str(triples_map_inner.file_format).lower() == "csv": + reader = pd.read_csv(str(triples_map_inner.data_source), dtype = str)#, encoding = "ISO-8859-1") + reader = reader.where(pd.notnull(reader), None) + reader = reader.drop_duplicates(keep ='first') + data = reader.to_dict(orient='records') + hash_update(data, triples_map_inner, predicate_object_map.object_map, triples_map_inner.triples_map_id + "_" + predicate_object_map.object_map.child[0]) + else: + data = json.load(input_file_descriptor) + if triples_map_inner.iterator: + if triples_map_inner.iterator != "None" and triples_map_inner.iterator != "$.[*]": + join_iterator(data, triples_map_inner.iterator, triples_map_inner, predicate_object_map.object_map, triples_map_list) + else: + if isinstance(data, list): + hash_maker(data, triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + elif len(data) < 2: + hash_maker(data[list(data.keys())[0]], triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + else: + if isinstance(data, list): + hash_maker(data, triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + elif len(data) < 2: + hash_maker(data[list(data.keys())[0]], triples_map_inner, predicate_object_map.object_map,"", triples_map_list) + if child_list_value(predicate_object_map.object_map.child,row) in join_table[triples_map_inner.triples_map_id + "_" + predicate_object_map.object_map.child[0]]: + object_list = join_table[triples_map_inner.triples_map_id + "_" + predicate_object_map.object_map.child[0]][row[predicate_object_map.object_map.child[0]]] + else: + object_list = [] + no_update = False + object = None + else: + if (triples_map_inner.triples_map_id + "_" + child_list(predicate_object_map.object_map.child)) not in join_table: + if str(triples_map_inner.file_format).lower() == "csv" or triples_map_inner.file_format == "JSONPath": + with open(str(triples_map_inner.data_source), "r") as input_file_descriptor: + if str(triples_map_inner.file_format).lower() == "csv": + data = csv.DictReader(input_file_descriptor, delimiter=delimiter) + hash_maker_list(data, triples_map_inner, predicate_object_map.object_map) + else: + data = json.load(input_file_descriptor) + if isinstance(data, list): + hash_maker_list(data, triples_map_inner, predicate_object_map.object_map) + elif len(data) < 2: + hash_maker_list(data[list(data.keys())[0]], triples_map_inner, predicate_object_map.object_map) + + elif triples_map_inner.file_format == "XPath": + with open(str(triples_map_inner.data_source), "r") as input_file_descriptor: + child_tree = ET.parse(input_file_descriptor) + child_root = child_tree.getroot() + hash_maker_xml(child_root, triples_map_inner, predicate_object_map.object_map) + else: + database, query_list = translate_sql(triples_map) + db = connector.connect(host=host, port=int(port), user=user, password=password) + cursor = db.cursor(buffered=True) + cursor.execute("use " + database) + for query in query_list: + cursor.execute(query) + hash_maker_array(cursor, triples_map_inner, predicate_object_map.object_map) + if sublist(predicate_object_map.object_map.child,row.keys()): + if child_list_value(predicate_object_map.object_map.child,row) in join_table[triples_map_inner.triples_map_id + "_" + child_list(predicate_object_map.object_map.child)]: + object_list = join_table[triples_map_inner.triples_map_id + "_" + child_list(predicate_object_map.object_map.child)][child_list_value(predicate_object_map.object_map.child,row)] + else: + object_list = [] + object = None + else: + if predicate_object_map.object_map.parent != None: + if predicate_object_map.object_map.parent[0] != predicate_object_map.object_map.child[0]: + if (triples_map_inner.triples_map_id + "_" + child_list(predicate_object_map.object_map.child)) not in join_table: + with open(str(triples_map_inner.data_source), "r") as input_file_descriptor: + if str(triples_map_inner.file_format).lower() == "csv": + parent_data = csv.DictReader(input_file_descriptor, delimiter=delimiter) + hash_maker_list(parent_data, triples_map_inner, predicate_object_map.object_map) + else: + parent_data = json.load(input_file_descriptor) + if isinstance(parent_data, list): + hash_maker_list(parent_data, triples_map_inner, predicate_object_map.object_map) + else: + hash_maker_list(parent_data[list(parent_data.keys())[0]], triples_map_inner, predicate_object_map.object_map) + if sublist(predicate_object_map.object_map.child,row.keys()): + if child_list_value(predicate_object_map.object_map.child,row) in join_table[triples_map_inner.triples_map_id + "_" + child_list(predicate_object_map.object_map.child)]: + object_list = join_table[triples_map_inner.triples_map_id + "_" +
child_list(predicate_object_map.object_map.child)][child_list_value(predicate_object_map.object_map.child,row)] + else: + object_list = [] + object = None + else: + try: + object = "<" + string_substitution(triples_map_inner.subject_map.value, "{(.+?)}", row, "object","yes", triples_map.iterator) + ">" + except TypeError: + object = None + else: + try: + object = "<" + string_substitution(triples_map_inner.subject_map.value, "{(.+?)}", row, "object","yes", triples_map.iterator) + ">" + except TypeError: + object = None + break + else: + continue + else: + object = None + elif "quoted triples map" in predicate_object_map.object_map.mapping_type: + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == predicate_object_map.object_map.value: + if triples_map_element.data_source != triples_map.data_source: + if predicate_object_map.object_map.parent[0] != None: + if ("quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]) not in join_table: + if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": + with open(str(triples_map_element.data_source), "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + data = csv.DictReader(input_file_descriptor, delimiter=',') + inner_hash_maker(data, triples_map_element, predicate_object_map.object_map, "quoted", triples_map_list) + else: + pass + if row[predicate_object_map.object_map.child[0]] in join_table["quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]]: + object_list = join_table["quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]][row[predicate_object_map.object_map.child[0]]] + else: + object_list = inner_semantify_file(triples_map_element, triples_map_list, delimiter, row, base) + object = None + else: + object = None + if predicate != None and object != None and subject != None: + for graph in triples_map.subject_map.graph: + triple = subject + " " + predicate + " " + object + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject","yes", triples_map.iterator) + ">" + else: + triple = triple[:-2] + " <" + graph + ">" + triples_list.append(triple) + if predicate[1:-1] in predicate_object_map.graph: + triple = subject + " " + predicate + " " + object + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution(predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", row, "subject","yes", triples_map.iterator) + ">" + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">" + triples_list.append(triple) + elif predicate != None and subject != None and object_list: + for obj in object_list: + if obj != None: + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">" + else: + triple = subject + " " + predicate + " " + obj + else: + if "quoted triples map" in predicate_object_map.object_map.mapping_type: + triple = subject + " " + predicate + " <<" + obj + ">>" + else: + triple = subject + " " + predicate + " " + obj + if graph != None and "defaultGraph" not 
in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject","yes", triples_map.iterator) + ">" + else: + triple = triple[:-2] + " <" + graph + ">" + triples_list.append(triple) + + if predicate[1:-1] in predicate_object_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">" + else: + triple = subject + " " + predicate + " " + obj + else: + triple = subject + " " + predicate + " " + obj + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution(predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", row, "subject","yes", triples_map.iterator) + ">" + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">" + triples_list.append(triple) + object_list = [] + elif predicate != None and object != None and subject_list: + for subj in subject_list: + if subj != None: + for graph in triples_map.subject_map.graph: + triple = "<< " + subj + ">> " + predicate + " " + object + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject","yes", triples_map.iterator) + ">" + else: + triple = triple[:-2] + " <" + graph + ">" + triples_list.append(triple) + + if predicate[1:-1] in predicate_object_map.graph: + triple = "<< " + subj + " >> " + predicate + " " + object + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution(predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", row, "subject","yes", triples_map.iterator) + ">" + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">" + triples_list.append(triple) + subject_list = [] + elif predicate != None and object_list and subject_list: + for subj in subject_list: + for obj in object_list: + if subj != None: + for graph in triples_map.subject_map.graph: + if "quoted triples map" in predicate_object_map.object_map.mapping_type: + triple = "<< " + subj + ">> " + predicate + " <<" + obj + ">>" + else: + triple = "<< " + subj + ">> " + predicate + " " + obj + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject","yes", triples_map.iterator) + ">" + else: + triple = triple[:-2] + " <" + graph + ">" + triples_list.append(triple) + + if predicate[1:-1] in predicate_object_map.graph: + triple = "<< " + subj + ">> " + predicate + " " + obj + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution(predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", row, "subject","yes", triples_map.iterator) + ">" + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">" + triples_list.append(triple) + subject_list = [] + object_list = [] + else: + continue + return triples_list \ No newline at end of file
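The branches above serialize RML-star results: a quoted triples map contributes a `<< s p o >>` quoted triple on the subject and/or object position of the asserted triple. The new mapping_functions.py introduced below complements this by resolving FNML function executions whose parameters may themselves be function executions: nested calls are evaluated first and their results fed into the outer call as an enriched row. A simplified sketch of that recursion, assuming hypothetical FnO-style functions (ex:toUpper, ex:concat); none of these names come from the codebase:

    funcs = {
        "ex:toUpper": lambda args: args["input"].upper(),
        "ex:concat": lambda args: args["left"] + args["right"],
    }

    def evaluate(call, row):
        # A call is {"function": name, "inputs": {param: spec}}; a spec is
        # either a column name or a nested call, which is evaluated first.
        args = {}
        for param, spec in call["inputs"].items():
            args[param] = evaluate(spec, row) if isinstance(spec, dict) else row[spec]
        return funcs[call["function"]](args)

    row = {"first": "ada", "last": "lovelace"}
    call = {"function": "ex:toUpper",
            "inputs": {"input": {"function": "ex:concat",
                                 "inputs": {"left": "first", "right": "last"}}}}
    print(evaluate(call, row))   # ADALOVELACE

diff --git a/rdfizer/rdfizer/mapping_functions.py b/rdfizer/rdfizer/mapping_functions.py new file mode 100644 index 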
0000000..2dabac6 --- /dev/null +++ b/rdfizer/rdfizer/mapping_functions.py @@ -0,0 +1,134 @@ +import re +import sys +import os +from .fnml_functions import * + +def new_inner_function(row,function,triples_map): + functions = [] + keys = [] + for func_map in triples_map.func_map_list: + if func_map.func_map_id == function: + for param in func_map.parameters: + if func_map.parameters[param]["type"] == "function": + for fm in triples_map.func_map_list: + if fm.func_map_id == func_map.parameters[param]["value"]: + functions.append(func_map.parameters[param]["value"]) + func_map.parameters[param]["type"] = "reference" + elif func_map.parameters[param]["type"] == "template": + if "{" in func_map.parameters[param]["value"]: + attr_list = func_map.parameters[param]["value"].split("{") + for attr in attr_list: + if "}" in attr: + keys.append(attr.split("}")[0]) + elif func_map.parameters[param]["type"] == "reference": + keys.append(func_map.parameters[param]["value"]) + if functions: + temp_row = {} + for func in functions: + value = new_inner_function(row,func,triples_map) + temp_row[func] = value + for key in keys: + temp_row[key] = row[key] + current_func = {"inputs":func_map.parameters, + "function":func_map.name} + return execute_function(temp_row,None,current_func) + else: + current_func = {"inputs":func_map.parameters, + "function":func_map.name} + return execute_function(row,None,current_func) + + +def inner_function(row,dic,triples_map_list): + + functions = [] + keys = [] + for attr in dic["inputs"]: + if ("reference function" in attr[1]): + functions.append(attr[0]) + elif "template" in attr[1]: + for value in attr[0].split("{"): + if "}" in value: + keys.append(value.split("}")[0]) + elif "constant" not in attr[1]: + keys.append(attr[0]) + if functions: + temp_dics = {} + for function in functions: + for tp in triples_map_list: + if tp.triples_map_id == function: + temp_dic = create_dictionary(tp) + current_func = {"inputs":temp_dic["inputs"], + "function":temp_dic["executes"], + "func_par":temp_dic, + "termType":True} + temp_dics[function] = current_func + temp_row = {} + for dics in temp_dics: + value = inner_function(row,temp_dics[dics],triples_map_list) + temp_row[dics] = value + for key in keys: + temp_row[key] = row[key] + return execute_function(temp_row,None,dic) + else: + return execute_function(row,None,dic) + +def inner_values(row,dic,triples_map_list): + values = "" + for inputs in dic["inputs"]: + if "reference" == inputs[1]: + values += str(row[inputs[0]]) + elif "template" == inputs[1]: + for string in inputs[0].split("{"): + if "}" in string: + values += str(row[string.split("}")[0]]) + elif "reference function" == inputs[1]: + temp_dics = {} + for tp in triples_map_list: + if tp.triples_map_id == inputs[0]: + temp_dic = create_dictionary(tp) + current_func = {"inputs":temp_dic["inputs"], + "function":temp_dic["executes"], + "func_par":temp_dic, + "termType":True} + values += inner_values(row,temp_dic,triples_map_list) + return values + +def inner_function_exists(inner_func, inner_functions): + for inner_function in inner_functions: + if inner_func["id"] in inner_function["id"]: + return False + return True + +def create_dictionary(triple_map): + dic = {} + inputs = [] + for tp in triple_map.predicate_object_maps_list: + if "#" in tp.predicate_map.value: + key = tp.predicate_map.value.split("#")[1] + tp_type = tp.predicate_map.mapping_type + elif "/" in tp.predicate_map.value: + key = tp.predicate_map.value.split("/")[len(tp.predicate_map.value.split("/"))-1] + tp_type = 
tp.predicate_map.mapping_type + if "constant" in tp.object_map.mapping_type: + value = tp.object_map.value + tp_type = tp.object_map.mapping_type + if "template" in tp.object_map.mapping_type: + value = tp.object_map.value + tp_type = tp.object_map.mapping_type + elif "executes" in tp.predicate_map.value: + if "#" in tp.object_map.value: + value = tp.object_map.value.split("#")[1] + tp_type = tp.object_map.mapping_type + elif "/" in tp.object_map.value: + value = tp.object_map.value.split("/")[len(tp.object_map.value.split("/"))-1] + tp_type = tp.object_map.mapping_type + else: + value = tp.object_map.value + tp_type = tp.object_map.mapping_type + + dic.update({key : value}) + if (key != "executes") and ([value,tp_type,key] not in inputs): + inputs.append([value,tp_type,key]) + + dic["inputs"] = inputs + return dic \ No newline at end of file diff --git a/rdfizer/rdfizer/semantify.py b/rdfizer/rdfizer/semantify.py index fa1b457..45b2cee 100755 --- a/rdfizer/rdfizer/semantify.py +++ b/rdfizer/rdfizer/semantify.py @@ -13,7 +13,17 @@ import pandas as pd import xml.etree.ElementTree as ET from urllib.request import urlopen +import gzip +import requests +import shutil +import zipfile +import io +import tarfile +from SPARQLWrapper import SPARQLWrapper, JSON from .functions import * +from .fnml_functions import * +from .mapping_functions import * +from .inner_functions import * import logging try: @@ -64,6 +74,12 @@ base = "" global blank_message blank_message = True +global delimiter +delimiter = {} +global logical_dump +logical_dump = {} +global current_logical_dump +current_logical_dump = "" global general_predicates general_predicates = {"http://www.w3.org/2000/01/rdf-schema#subClassOf": "", "http://www.w3.org/2002/07/owl#sameAs": "", @@ -186,7 +202,7 @@ def dictionary_table_update(resource): id_number += 1 -def join_iterator(data, iterator, parent, child): +def join_iterator(data, iterator, parent, child, triples_map_list): if iterator != "": new_iterator = "" temp_keys = iterator.split(".") @@ -219,28 +235,28 @@ def join_iterator(data, iterator, parent, child): row = row[list(row.keys())[0]] if isinstance(row, list): for sub_row in row: - join_iterator(sub_row, iterator, parent, child) + join_iterator(sub_row, iterator, parent, child, triples_map_list) executed = False break elif isinstance(row, str): row = [] break else: - join_iterator(row[list(row.keys())[0]], "", parent, child) + join_iterator(row[list(row.keys())[0]], "", parent, child, triples_map_list) else: path = jsonpath_find(temp_keys[len(temp_keys) - 1], row, "", []) for key in path[0].split("."): if key in temp_keys: - join_iterator(row[key], "", parent, child) + join_iterator(row[key], "", parent, child, triples_map_list) elif key in row: row = row[key] if isinstance(row, list): for sub_row in row: - join_iterator(sub_row, iterator, parent, child) + join_iterator(sub_row, iterator, parent, child, triples_map_list) executed = False break elif isinstance(row, dict): - join_iterator(row, iterator, parent, child) + join_iterator(row, iterator, parent, child, triples_map_list) executed = False break elif isinstance(row, str): @@ -249,23 +265,23 @@ def join_iterator(data, iterator, parent, child): if new_iterator != ".": if "*" == new_iterator[-2]: for sub_row in row: - join_iterator(sub_row, iterator.replace(new_iterator[:-1], ""), parent, child) + join_iterator(sub_row, iterator.replace(new_iterator[:-1], ""), parent, child, triples_map_list) executed = False break if "[*][*]" in new_iterator: for sub_row in row: for sub_sub_row 
in row[sub_row]: - join_iterator(sub_sub_row, iterator.replace(new_iterator[:-1], ""), parent, child) + join_iterator(sub_sub_row, iterator.replace(new_iterator[:-1], ""), parent, child, triples_map_list) executed = False break if isinstance(row, list): for sub_row in row: - join_iterator(sub_row, iterator.replace(new_iterator[:-1], ""), parent, child) + join_iterator(sub_row, iterator.replace(new_iterator[:-1], ""), parent, child, triples_map_list) executed = False break else: if parent.triples_map_id + "_" + child.child[0] not in join_table: - hash_maker([data], parent, child) + hash_maker([data], parent, child,"", triples_map_list) else: hash_update([data], parent, child, parent.triples_map_id + "_" + child.child[0]) @@ -329,28 +345,59 @@ def hash_update(parent_data, parent_subject, child_object, join_id): join_table[join_id].update(hash_table) -def hash_maker(parent_data, parent_subject, child_object): +def hash_maker(parent_data, parent_subject, child_object, quoted, triples_map_list): global blank_message hash_table = {} for row in parent_data: - if child_object.parent[0] in row.keys(): - if row[child_object.parent[0]] in hash_table: - if duplicate == "yes": - if parent_subject.subject_map.subject_mapping_type == "reference": - value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, - parent_subject.iterator) - if value != None: + if quoted == "": + if child_object.parent[0] in row.keys(): + if row[child_object.parent[0]] in hash_table: + if duplicate == "yes": + if parent_subject.subject_map.subject_mapping_type == "reference": + value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, + parent_subject.iterator) + if value != None: + if "http" in value and "<" not in value: + value = "<" + value[1:-1] + ">" + elif "http" in value and "<" in value: + value = value[1:-1] + if value not in hash_table[row[child_object.parent[0]]]: + hash_table[row[child_object.parent[0]]].update({value: "object"}) + else: + if string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, + parent_subject.iterator) != None: + value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", + ignore, parent_subject.iterator) + if value != None: + if parent_subject.subject_map.term_type != None: + if "BlankNode" in parent_subject.subject_map.term_type: + if "/" in value: + value = "_:" + encode_char(value.replace("/", "2F")).replace("%", "") + if "." in value: + value = value.replace(".", "2E") + if blank_message: + logger.warning( + "Incorrect format for Blank Nodes. \"/\" will be replaced with \"2F\".") + blank_message = False + else: + value = "_:" + encode_char(value).replace("%", "")
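hash_maker materializes one hash index per (parent triples map, child column) pair, keyed by the parent column's value, so that each child row later joins in constant time; when quoted is non-empty the same index stores fully generated quoted triples instead of subject terms. The shape of join_table, with hypothetical identifiers and values:

    join_table = {
        "TriplesMap2_person_id": {
            "42": {"<http://example.com/person/42>": "object"},
            "43": {"<http://example.com/person/43>": "object"},
        }
    }
    row = {"person_id": "42"}
    # At materialization time a child row resolves its parents in O(1):
    matches = join_table["TriplesMap2_person_id"].get(row["person_id"], {})
    print(list(matches))   # ['<http://example.com/person/42>']

+ if "." 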
in value: + value = value.replace(".", "2E") + else: + value = "<" + value + ">" + hash_table[row[child_object.parent[0]]].update({value: "object"}) + else: + if parent_subject.subject_map.subject_mapping_type == "reference": + value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, + parent_subject.iterator) if "http" in value and "<" not in value: value = "<" + value[1:-1] + ">" elif "http" in value and "<" in value: value = value[1:-1] - if value not in hash_table[row[child_object.parent[0]]]: hash_table[row[child_object.parent[0]]].update({value: "object"}) - else: - if string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, - parent_subject.iterator) != None: - value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", - ignore, parent_subject.iterator) + else: + value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, + parent_subject.iterator) if value != None: if parent_subject.subject_map.term_type != None: if "BlankNode" in parent_subject.subject_map.term_type: @@ -369,15 +416,17 @@ def hash_maker(parent_data, parent_subject, child_object): else: value = "<" + value + ">" hash_table[row[child_object.parent[0]]].update({value: "object"}) + else: if parent_subject.subject_map.subject_mapping_type == "reference": value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, parent_subject.iterator) - if "http" in value and "<" not in value: - value = "<" + value[1:-1] + ">" - elif "http" in value and "<" in value: - value = value[1:-1] - hash_table[row[child_object.parent[0]]].update({value: "object"}) + if value != None: + if "http" in value and "<" not in value: + value = "<" + value[1:-1] + ">" + elif "http" in value and "<" in value: + value = value[1:-1] + hash_table.update({row[child_object.parent[0]]: {value: "object"}}) else: value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, parent_subject.iterator) @@ -398,40 +447,26 @@ def hash_maker(parent_data, parent_subject, child_object): value = value.replace(".", "2E") else: value = "<" + value + ">" - hash_table[row[child_object.parent[0]]].update({value: "object"}) - - else: - if parent_subject.subject_map.subject_mapping_type == "reference": - value = string_substitution(parent_subject.subject_map.value, ".+", row, "object", ignore, - parent_subject.iterator) - if value != None: - if "http" in value and "<" not in value: - value = "<" + value[1:-1] + ">" - elif "http" in value and "<" in value: - value = value[1:-1] - hash_table.update({row[child_object.parent[0]]: {value: "object"}}) - else: - value = string_substitution(parent_subject.subject_map.value, "{(.+?)}", row, "object", ignore, - parent_subject.iterator) - if value != None: - if parent_subject.subject_map.term_type != None: - if "BlankNode" in parent_subject.subject_map.term_type: - if "/" in value: - value = "_:" + encode_char(value.replace("/", "2F")).replace("%", "") - if "." in value: - value = value.replace(".", "2E") - if blank_message: - logger.warning( - "Incorrect format for Blank Nodes. \"/\" will be replace with \"2F\".") - blank_message = False - else: - value = "_:" + encode_char(value).replace("%", "") - if "." 
in value: - value = value.replace(".", "2E") + hash_table.update({row[child_object.parent[0]]: {value: "object"}}) + else: + for triples in inner_semantify_file(parent_subject, triples_map_list, ",", row, base): + if triples != None: + if isinstance(child_object.parent,list): + parent = child_object.parent[0] + else: + parent = child_object.parent + if row[parent] in hash_table: + if duplicate == "yes": + if triples not in hash_table[row[parent]]: + hash_table[row[parent]].update({triples : "subject"}) else: - value = "<" + value + ">" - hash_table.update({row[child_object.parent[0]]: {value: "object"}}) - join_table.update({parent_subject.triples_map_id + "_" + child_object.child[0]: hash_table}) + hash_table[row[parent]].update({triples : "subject"}) + else: + hash_table.update({row[parent] : {triples : "subject"}}) + if isinstance(child_object.child,list): + join_table.update({parent_subject.triples_map_id + "_" + child_object.child[0] : hash_table}) + else: + join_table.update({"quoted_" + parent_subject.triples_map_id + "_" + child_object.child : hash_table}) def hash_maker_list(parent_data, parent_subject, child_object): @@ -922,8 +957,10 @@ def mappings_expansion(triples_map_list): subject_map = triples_map.subject_map else: subject_map = tm.SubjectMap(triples_map.subject_map.value, triples_map.subject_map.condition, - triples_map.subject_map.subject_mapping_type, [None], - triples_map.subject_map.term_type, triples_map.subject_map.graph) + triples_map.subject_map.subject_mapping_type, + triples_map.subject_map.parent,triples_map.child, [None], + triples_map.subject_map.term_type, triples_map.subject_map.graph, + triples_map.func_result) if po.object_map.mapping_type == "parent triples map": if po.object_map.child != None: for triples_map_element in triples_map_list: @@ -954,7 +991,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -964,7 +1004,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -973,7 +1016,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -982,7 +1028,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] elif str(triples_map.file_format).lower() == "csv": if triples_map.data_source == triples_map_element.data_source: if po.object_map.child[0] == po.object_map.parent[0]: @@ -1005,7 +1054,10 @@ def mappings_expansion(triples_map_list): [predicate_object], triples_map.reference_formulation, triples_map.iterator, 
triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -1014,7 +1066,10 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -1023,7 +1078,10 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: if triples_map.query == triples_map_element.query or triples_map.tablename == triples_map_element.tablename: if po.object_map.child[0] == po.object_map.parent[0]: @@ -1046,7 +1104,11 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -1055,7 +1117,11 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: predicate_object = tm.PredicateObjectMap(po.predicate_map, po.object_map, po.graph) @@ -1064,7 +1130,11 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] break else: for triples_map_element in triples_map_list: @@ -1093,7 +1163,10 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: if len(triples_map_element.predicate_object_maps_list) > 1: po.object_map.value = po.object_map.value + "_1" @@ -1105,7 +1178,10 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] elif str(triples_map.file_format).lower() == "csv": if triples_map_element.subject_map.subject_mapping_type == "template": object_map = tm.ObjectMap("template", triples_map_element.subject_map.value, "None", "None", "None", triples_map_element.subject_map.term_type, "None", "None") else: object_map = tm.ObjectMap("reference", triples_map_element.subject_map.value, "None", "None", "None", triples_map_element.subject_map.term_type, "None", "None") pom_list.append( tm.PredicateObjectMap(po.predicate_map, object_map, po.graph)) @@ -1126,13 +1202,19 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: new_list += [tm.TriplesMap(triples_map.triples_map_id + "_" + str(i), triples_map.data_source, subject_map, [po], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: if ( triples_map.query != "None" and triples_map_element.query != "None" and triples_map.query == triples_map_element.query) or ( @@ -1156,18 +1238,26 @@ [predicate_object], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] else: new_list += [tm.TriplesMap(triples_map.triples_map_id + "_" + str(i), triples_map.data_source, subject_map, [po], triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query, + triples_map.function, + triples_map.func_map_list, + triples_map.mappings_type)] break else: new_list += [tm.TriplesMap(triples_map.triples_map_id + "_" + str(i), triples_map.data_source, subject_map, [po], triples_map.reference_formulation, - triples_map.iterator, triples_map.tablename, triples_map.query)] + triples_map.iterator, triples_map.tablename, triples_map.query, + triples_map.function,triples_map.func_map_list, + triples_map.mappings_type)] i += 1 else: new_list += [triples_map]
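Every expanded map is rebuilt with three extra constructor arguments (function, func_map_list, mappings_type). Those call sites imply a TriplesMap signature roughly like the following; this is inferred from the calls above, since triples_map.py itself is not part of this hunk:

    class TriplesMap:
        def __init__(self, triples_map_id, data_source, subject_map,
                     predicate_object_maps_list, reference_formulation,
                     iterator, tablename, query,
                     function=False, func_map_list=None, mappings_type=None):
            # The last three parameters are the additions in this patch:
            # `function` flags FNML usage, `func_map_list` carries the parsed
            # FunctionMap objects, and `mappings_type` records the rdf:type
            # bound to ?mappings_type in the mapping query.
            self.triples_map_id = triples_map_id
            self.data_source = data_source
            self.subject_map = subject_map
            self.predicate_object_maps_list = predicate_object_maps_list
            self.reference_formulation = reference_formulation
            self.iterator = iterator
            self.tablename = tablename
            self.query = query
            self.function = function
            self.func_map_list = func_map_list if func_map_list is not None else []
            self.mappings_type = mappings_type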
@@ -1182,20 +1272,21 @@ if str(triples_map.file_format).lower() == "csv" or triples_map.file_format == "JSONPath" or triples_map.file_format == "XPath": if triples_map.data_source == triples_map_element.data_source: if po.object_map.child[0] == po.object_map.parent[0]: - if triples_map_element.subject_map.subject_mapping_type == "template": - object_map = tm.ObjectMap("template", - triples_map_element.subject_map.value, "None", - "None", "None", - triples_map_element.subject_map.term_type, - "None", "None") - else: - object_map = tm.ObjectMap("reference", - triples_map_element.subject_map.value, "None", - "None", "None", - triples_map_element.subject_map.term_type, - "None", "None") - pom_list.append( - tm.PredicateObjectMap(po.predicate_map, object_map, po.graph)) + """if triples_map_element.subject_map.subject_mapping_type == "template": + object_map = tm.ObjectMap("template", + triples_map_element.subject_map.value, "None", + "None", "None", + triples_map_element.subject_map.term_type, + "None", "None") + else: + object_map = tm.ObjectMap("reference", + triples_map_element.subject_map.value, "None", + "None", "None", + triples_map_element.subject_map.term_type, + "None", "None") + pom_list.append( + tm.PredicateObjectMap(po.predicate_map, object_map, po.graph))""" + pom_list.append(po) else: pom_list.append(po) else: pom_list.append(po) else: @@ -1235,7 +1326,7 @@ new_list += [ tm.TriplesMap(triples_map.triples_map_id, triples_map.data_source, triples_map.subject_map, pom_list, triples_map.reference_formulation, triples_map.iterator, triples_map.tablename, - triples_map.query)] + triples_map.query,triples_map.function,triples_map.func_map_list,triples_map.mappings_type)] return new_list @@ -1267,27 +1358,79 @@ def mapping_parser(mapping_file): sys.exit(1) if new_formulation == "yes": - mapping_query = """ + function_query = """ prefix rr: <http://www.w3.org/ns/r2rml#> prefix rml: <http://w3id.org/rml/> prefix d2rq: <http://www.wiwiss.fu-berlin.de/suhl/bizer/D2RQ/0.1#> prefix td: <https://www.w3.org/2019/wot/td#> prefix hctl: <https://www.w3.org/2019/wot/hypermedia#> + prefix dcat: <http://www.w3.org/ns/dcat#> + prefix void: <http://rdfs.org/ns/void#> + prefix sd: <http://www.w3.org/ns/sparql-service-description#> SELECT DISTINCT * WHERE { + OPTIONAL { + ?function_id rml:function ?function . + OPTIONAL { + ?function_id rml:input ?input. + ?input rml:parameter ?param. + OPTIONAL { + ?input rml:inputValue ?input_value. + } + OPTIONAL { + ?input rml:inputValueMap ?input_map. + OPTIONAL {?input_map rml:reference ?param_reference.} + OPTIONAL {?input_map rml:template ?param_template.} + OPTIONAL {?input_map rml:functionExecution ?param_func.} + } + } + } + } + """
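function_query returns one row per declared parameter of each function execution; the parser later in this patch folds those rows into a single FunctionMap per execution. The parameters dictionary it accumulates has roughly this shape (the URI and values are illustrative only):

    parameters = {
        "http://example.com/function/param#input": {
            "value": "name",       # a constant, column, template, or nested function id
            "type": "reference",   # one of: constant, reference, template, function
        },
    }
    # tm.FunctionMap(function_id, function_name, parameters) is then kept in
    # func_map_list and resolved row by row at materialization time.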
+ mapping_query = """ + prefix rr: <http://www.w3.org/ns/r2rml#> + prefix rml: <http://w3id.org/rml/> + prefix d2rq: <http://www.wiwiss.fu-berlin.de/suhl/bizer/D2RQ/0.1#> + prefix td: <https://www.w3.org/2019/wot/td#> + prefix hctl: <https://www.w3.org/2019/wot/hypermedia#> + prefix dcat: <http://www.w3.org/ns/dcat#> + prefix void: <http://rdfs.org/ns/void#> + prefix sd: <http://www.w3.org/ns/sparql-service-description#> + SELECT DISTINCT * + WHERE { # Subject ------------------------------------------------------------------------- + OPTIONAL{?triples_map_id a ?mappings_type} ?triples_map_id rml:logicalSource ?_source . OPTIONAL{ ?_source rml:source ?source_attr . - ?source_attr rml:root ?root . + OPTIONAL {?source_attr rml:root ?root .} ?source_attr rml:path ?data_source } OPTIONAL{ ?_source rml:source ?data_link . - ?data_link td:hasForm ?form . + ?data_link dcat:downloadURL ?url_source . + } + OPTIONAL{ + ?_source rml:source ?data_link . + ?data_link void:dataDump ?url_source . + } + OPTIONAL{ + ?_source rml:source ?data_link . + ?data_link dcat:url ?url_source . + ?data_link dcat:dialect ?dialect . + ?dialect dcat:delimiter ?delimiter . + } + OPTIONAL{ + ?_source rml:source ?data_link . + ?data_link td:hasPropertyAffordance ?has_form . + ?has_form td:hasForm ?form . ?form hctl:hasTarget ?url_source . } + OPTIONAL{ + ?_source rml:source ?data_link . + ?data_link sd:endpoint ?url_source . + } OPTIONAL {?_source rml:referenceFormulation ?ref_form .} OPTIONAL { ?_source rml:iterator ?iterator . } OPTIONAL { ?_source rr:tableName ?tablename .} @@ -1297,20 +1440,44 @@ OPTIONAL {?_subject_map rml:template ?subject_template .} OPTIONAL {?_subject_map rml:reference ?subject_reference .} OPTIONAL {?_subject_map rml:constant ?subject_constant} + OPTIONAL {?_subject_map rml:quotedTriplesMap ?subject_quoted . + OPTIONAL { + ?_subject_map rml:joinCondition ?join_condition . + ?join_condition rml:child ?subject_child_value; + rml:parent ?subject_parent_value. + } + } OPTIONAL { ?_subject_map rml:class ?rdf_class . } OPTIONAL { ?_subject_map rml:termType ?termtype . } OPTIONAL { ?_subject_map rml:graph ?graph . } - OPTIONAL { ?_subject_map rml:graphMap ?_graph_structure . - ?_graph_structure rml:constant ?graph . } - OPTIONAL { ?_subject_map rml:graphMap ?_graph_structure . - ?_graph_structure rml:template ?graph . } + OPTIONAL { ?_subject_map rml:graphMap ?subject_graph_structure . + ?subject_graph_structure rml:constant ?graph . + OPTIONAL {?subject_graph_structure rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?subject_graph_dump.} + } + OPTIONAL { ?_subject_map rml:graphMap ?subj_graph_structure . + ?subj_graph_structure rml:template ?graph . + OPTIONAL {?subj_graph_structure rml:logicalTarget ?subj_output . + ?subj_output rml:target ?subj_dump. + ?subj_dump void:dataDump ?subject_graph_dump.} + } + OPTIONAL {?_subject_map rml:functionExecution ?subject_function . + OPTIONAL { + ?_subject_map rml:returnMap ?output_map . + ?output_map rml:constant ?subject_output . + } + } + OPTIONAL {?_subject_map rml:logicalTarget ?output. + ?output rml:target ?dump. + ?dump void:dataDump ?subject_dump. + }
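A logical source can now also be a dcat:downloadURL, a void:dataDump, a CSV dialect with an explicit delimiter, or an sd:endpoint; together with the newly imported gzip/requests/zipfile/tarfile modules this implies that remote dumps may be downloaded and unpacked at translation time. A minimal sketch of such a retrieval step, assuming a plain or gzipped single-file dump (fetch_source is illustrative, not the engine's actual helper):

    import gzip
    import requests

    def fetch_source(url):
        # Download a remote logical source; transparently decompress
        # single-file gzip dumps and return the payload as text.
        response = requests.get(url)
        response.raise_for_status()
        if url.endswith(".gz"):
            return gzip.decompress(response.content).decode("utf-8")
        return response.text

# Predicate ----------------------------------------------------------------------- OPTIONAL { ?triples_map_id rml:predicateObjectMap ?_predicate_object_map . OPTIONAL { - ?triples_map_id rml:predicateObjectMap ?_predicate_object_map . ?_predicate_object_map rml:predicateMap ?_predicate_map . 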
?_predicate_map rml:constant ?predicate_constant . } @@ -1324,13 +1491,42 @@ def mapping_parser(mapping_file): } OPTIONAL { ?_predicate_object_map rml:predicate ?predicate_constant_shortcut . - } - + } + OPTIONAL { + ?_predicate_object_map rml:predicateMap ?_predicate_map . + ?_predicate_map rml:functionExecution ?predicate_function . + OPTIONAL { + ?_predicate_map rml:returnMap ?output_map . + ?output_map rml:constant ?predicate_output . + } + } + OPTIONAL { + ?_predicate_map rml:logicalTarget ?pre_output . + ?pre_output rml:target ?pre_dump. + ?pre_dump void:dataDump ?predicate_dump. + } # Object -------------------------------------------------------------------------- + OPTIONAL { ?_predicate_object_map rml:objectMap ?_object_map . ?_object_map rml:constant ?object_constant . + OPTIONAL { ?_object_map rml:language ?language .} + OPTIONAL {?_object_map rml:languageMap ?language_map. + OPTIONAL {?language_map rml:reference ?language_value.} + OPTIONAL {?language_map rml:constant ?language.} + OPTIONAL {?language_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?language_dump.} + } + OPTIONAL {?_object_map rml:datatypeMap ?datatype_map. + OPTIONAL {?datatype_map rml:template ?datatype_value.} + OPTIONAL {?datatype_map rml:constant ?datatype.} + OPTIONAL {?datatype_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?datatype_dump.} + } + OPTIONAL {?_object_map rml:termType ?term .} OPTIONAL { ?_object_map rml:datatype ?object_datatype . } @@ -1350,7 +1546,19 @@ def mapping_parser(mapping_file): ?_object_map rml:reference ?object_reference . OPTIONAL { ?_object_map rml:language ?language .} OPTIONAL {?_object_map rml:languageMap ?language_map. - ?language_map rml:reference ?language_value.} + OPTIONAL {?language_map rml:reference ?language_value.} + OPTIONAL {?language_map rml:constant ?language.} + OPTIONAL {?language_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?language_dump.} + } + OPTIONAL {?_object_map rml:datatypeMap ?datatype_map. + OPTIONAL {?datatype_map rml:template ?datatype_value.} + OPTIONAL {?datatype_map rml:constant ?object_datatype.} + OPTIONAL {?datatype_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?datatype_dump.} + } OPTIONAL {?_object_map rml:termType ?term .} OPTIONAL { ?_object_map rml:datatype ?object_datatype . @@ -1363,17 +1571,63 @@ def mapping_parser(mapping_file): ?_object_map rml:joinCondition ?join_condition . ?join_condition rml:child ?child_value; rml:parent ?parent_value. + OPTIONAL{?parent_value rml:functionExecution ?executed_parent . + ?executed_parent rml:function ?parent_function .} + OPTIONAL{?child_value rml:functionExecution ?executed_child . + ?executed_child rml:function ?child_function .} OPTIONAL {?_object_map rml:termType ?term .} } } + OPTIONAL { + ?_predicate_object_map rml:objectMap ?_object_map . + ?_object_map rml:quotedTriplesMap ?object_quoted . + OPTIONAL { + ?_object_map rml:joinCondition ?join_condition . + ?join_condition rml:child ?child_value; + rml:parent ?parent_value. + } + } OPTIONAL { ?_predicate_object_map rml:object ?object_constant_shortcut . } + OPTIONAL{ + OPTIONAL { + ?_object_map rml:datatype ?object_datatype . + } + ?_object_map rml:functionExecution ?function. + OPTIONAL { + ?_object_map rml:returnMap ?output_map . + ?output_map rml:constant ?func_output . + } + OPTIONAL { ?_object_map rml:language ?language .} + OPTIONAL {?_object_map rml:languageMap ?language_map. 
+ OPTIONAL {?language_map rml:reference ?language_value.} + OPTIONAL {?language_map rml:constant ?language_value.} + OPTIONAL {?language_map rml:logicalTarget ?language_output . + ?language_output rml:target ?language_dump. + ?language_dump void:dataDump ?language_dump.} + } + OPTIONAL {?_object_map rml:datatypeMap ?datatype_map. + OPTIONAL {?datatype_map rml:template ?datatype_value.} + OPTIONAL {?datatype_map rml:constant ?datatype_value.} + OPTIONAL {?datatype_map rml:logicalTarget ?output . + ?output rml:target ?dump. + ?dump void:dataDump ?datatype_dump.} + } + OPTIONAL {?_object_map rml:termType ?term .} + + } OPTIONAL {?_predicate_object_map rml:graph ?predicate_object_graph .} OPTIONAL { ?_predicate_object_map rml:graphMap ?_graph_structure . - ?_graph_structure rml:constant ?predicate_object_graph . } - OPTIONAL { ?_predicate_object_map rml:graphMap ?_graph_structure . - ?_graph_structure rml:template ?predicate_object_graph . } + OPTIONAL {?_graph_structure rml:template ?predicate_object_graph .} + OPTIONAL {?_graph_structure rml:constant ?predicate_object_graph .} + OPTIONAL {?_graph_structure rml:logicalTarget ?po_graph_output . + ?po_graph_output rml:target ?po_graph_dump. + ?po_graph_dump void:dataDump ?object_graph_dump.} + } + OPTIONAL { ?_object_map rml:logicalTarget ?obj_output. + ?obj_output rml:target ?obj_dump. + ?obj_dump void:dataDump ?object_dump.} } OPTIONAL { ?_source a d2rq:Database; @@ -1391,12 +1645,14 @@ prefix d2rq: <http://www.wiwiss.fu-berlin.de/suhl/bizer/D2RQ/0.1#> prefix td: <https://www.w3.org/2019/wot/td#> prefix htv: <http://www.w3.org/2011/http#> - prefix hctl: <https://www.w3.org/2019/wot/hypermedia#> + prefix hctl: <https://www.w3.org/2019/wot/hypermedia#> + prefix fnml: <http://semweb.mmlab.be/ns/fnml#> SELECT DISTINCT * WHERE { # Subject ------------------------------------------------------------------------- - ?triples_map_id rml:logicalSource ?_source . + OPTIONAL{?triples_map_id a ?mappings_type} + ?triples_map_id rml:logicalSource ?_source . OPTIONAL{?_source rml:source ?data_source .} OPTIONAL{ ?_source rml:source ?data_link . @@ -1408,50 +1664,59 @@ OPTIONAL { ?_source rr:tableName ?tablename .} OPTIONAL { ?_source rml:query ?query .} - ?triples_map_id rr:subjectMap ?_subject_map . + OPTIONAL {?triples_map_id rr:subjectMap ?_subject_map .} + OPTIONAL {?triples_map_id rml:subjectMap ?_subject_map .} OPTIONAL {?_subject_map rr:template ?subject_template .} OPTIONAL {?_subject_map rml:reference ?subject_reference .} OPTIONAL {?_subject_map rr:constant ?subject_constant} + OPTIONAL {?_subject_map rml:quotedTriplesMap ?subject_quoted . + OPTIONAL { + ?_subject_map rr:joinCondition ?join_condition . + ?join_condition rr:child ?subject_child_value; + rr:parent ?subject_parent_value. + } + } OPTIONAL { ?_subject_map rr:class ?rdf_class . } OPTIONAL { ?_subject_map rr:termType ?termtype . } OPTIONAL { ?_subject_map rr:graph ?graph . } OPTIONAL { ?_subject_map rr:graphMap ?_graph_structure . ?_graph_structure rr:constant ?graph . } OPTIONAL { ?_subject_map rr:graphMap ?_graph_structure . - ?_graph_structure rr:template ?graph . } + ?_graph_structure rr:template ?graph . } + OPTIONAL {?_subject_map fnml:functionValue ?subject_function .} # Predicate ----------------------------------------------------------------------- OPTIONAL { ?triples_map_id rr:predicateObjectMap ?_predicate_object_map . - + OPTIONAL {?_predicate_object_map rr:predicateMap ?_predicate_map .} + OPTIONAL {?_predicate_object_map rml:predicateMap ?_predicate_map .} OPTIONAL { - ?triples_map_id rr:predicateObjectMap ?_predicate_object_map . - ?_predicate_object_map rr:predicateMap ?_predicate_map . ?_predicate_map rr:constant ?predicate_constant . 
} OPTIONAL { - ?_predicate_object_map rr:predicateMap ?_predicate_map . ?_predicate_map rr:template ?predicate_template . } OPTIONAL { - ?_predicate_object_map rr:predicateMap ?_predicate_map . ?_predicate_map rml:reference ?predicate_reference . } OPTIONAL { ?_predicate_object_map rr:predicate ?predicate_constant_shortcut . - } + } + OPTIONAL { + ?_predicate_map fnml:functionValue ?predicate_function . + } # Object -------------------------------------------------------------------------- + OPTIONAL {?_predicate_object_map rr:objectMap ?_object_map .} + OPTIONAL {?_predicate_object_map rml:objectMap ?_object_map .} OPTIONAL { - ?_predicate_object_map rr:objectMap ?_object_map . ?_object_map rr:constant ?object_constant . OPTIONAL { ?_object_map rr:datatype ?object_datatype . } } OPTIONAL { - ?_predicate_object_map rr:objectMap ?_object_map . ?_object_map rr:template ?object_template . OPTIONAL {?_object_map rr:termType ?term .} OPTIONAL {?_object_map rml:languageMap ?language_map. @@ -1461,29 +1726,47 @@ def mapping_parser(mapping_file): } } OPTIONAL { - ?_predicate_object_map rr:objectMap ?_object_map . ?_object_map rml:reference ?object_reference . OPTIONAL { ?_object_map rr:language ?language .} OPTIONAL {?_object_map rml:languageMap ?language_map. ?language_map rml:reference ?language_value.} + OPTIONAL {?_object_map rml:datatypeMap ?datatype_map. + ?datatype_map rml:template ?datatype_value.} OPTIONAL {?_object_map rr:termType ?term .} OPTIONAL { ?_object_map rr:datatype ?object_datatype . } } OPTIONAL { - ?_predicate_object_map rr:objectMap ?_object_map . - ?_object_map rr:parentTriplesMap ?object_parent_triples_map . - OPTIONAL { - ?_object_map rr:joinCondition ?join_condition . - ?join_condition rr:child ?child_value; - rr:parent ?parent_value. - OPTIONAL {?_object_map rr:termType ?term .} - } - } + ?_object_map rr:parentTriplesMap ?object_parent_triples_map . + OPTIONAL { + ?_object_map rr:joinCondition ?join_condition . + ?join_condition rr:child ?child_value; + rr:parent ?parent_value. + OPTIONAL{?parent_value fnml:functionValue ?parent_function.} + OPTIONAL{?child_value fnml:functionValue ?child_function.} + OPTIONAL {?_object_map rr:termType ?term .} + } + OPTIONAL { + ?_object_map rr:joinCondition ?join_condition . + ?join_condition rr:child ?child_value; + rr:parent ?parent_value; + } + } OPTIONAL { ?_predicate_object_map rr:object ?object_constant_shortcut . } + OPTIONAL { + ?_predicate_object_map rml:object ?object_constant_shortcut . + } + OPTIONAL { + ?_object_map rml:quotedTriplesMap ?object_quoted . + OPTIONAL { + ?_object_map rr:joinCondition ?join_condition . + ?join_condition rr:child ?child_value; + rr:parent ?parent_value. + } + } OPTIONAL {?_predicate_object_map rr:graph ?predicate_object_graph .} OPTIONAL { ?_predicate_object_map rr:graphMap ?_graph_structure . ?_graph_structure rr:constant ?predicate_object_graph . 
} @@ -1499,49 +1782,143 @@ def mapping_parser(mapping_file): } } """ - mapping_query_results = mapping_graph.query(mapping_query) triples_map_list = [] + func_map_list = [] + if new_formulation == "yes": + mapping_query_results = mapping_graph.query(function_query) + for result_triples_map in mapping_query_results: + if result_triples_map.function_id != None: + func_map_exists = False + for func_map in func_map_list: + func_map_exists = func_map_exists or ( + str(func_map.func_map_id) == str(result_triples_map.function_id)) + if not func_map_exists: + parameters = {} + if result_triples_map.param != None: + if str(result_triples_map.param) not in parameters: + if result_triples_map.input_value != None: + parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.input_value), + "type":"constant"} + elif result_triples_map.param_reference != None: + parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_reference), + "type":"reference"} + elif result_triples_map.param_template != None: + parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_template), + "type":"template"} + elif result_triples_map.param_func != None: + parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_func), + "type":"function"} + func_map = tm.FunctionMap(str(result_triples_map.function_id),str(result_triples_map.function),parameters) + func_map_list.append(func_map) + else: + for func_map in func_map_list: + if str(func_map.func_map_id) == str(result_triples_map.function_id): + if result_triples_map.param != None: + if str(result_triples_map.param) not in func_map.parameters: + if result_triples_map.input_value != None: + func_map.parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.input_value), + "type":"constant"} + elif result_triples_map.param_reference != None: + func_map.parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_reference), + "type":"reference"} + elif result_triples_map.param_template != None: + func_map.parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_template), + "type":"template"} + elif result_triples_map.param_func != None: + func_map.parameters[str(result_triples_map.param)] = { + "value":str(result_triples_map.param_func), + "type":"function"} + mapping_query_results = mapping_graph.query(mapping_query) for result_triples_map in mapping_query_results: triples_map_exists = False for triples_map in triples_map_list: triples_map_exists = triples_map_exists or ( str(triples_map.triples_map_id) == str(result_triples_map.triples_map_id)) - if not triples_map_exists: if result_triples_map.subject_template != None: if result_triples_map.rdf_class is None: reference, condition = string_separetion(str(result_triples_map.subject_template)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_template), condition, "template", + subject_map = tm.SubjectMap(str(result_triples_map.subject_template), condition, "template","None","None", [result_triples_map.rdf_class], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") else: reference, condition = string_separetion(str(result_triples_map.subject_template)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_template), condition, "template", + subject_map = tm.SubjectMap(str(result_triples_map.subject_template), condition, "template","None","None", [str(result_triples_map.rdf_class)], 
result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") elif result_triples_map.subject_reference != None: if result_triples_map.rdf_class is None: reference, condition = string_separetion(str(result_triples_map.subject_reference)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_reference), condition, "reference", + subject_map = tm.SubjectMap(str(result_triples_map.subject_reference), condition, "reference","None","None", [result_triples_map.rdf_class], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") else: reference, condition = string_separetion(str(result_triples_map.subject_reference)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_reference), condition, "reference", + subject_map = tm.SubjectMap(str(result_triples_map.subject_reference), condition, "reference","None","None", [str(result_triples_map.rdf_class)], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") elif result_triples_map.subject_constant != None: if result_triples_map.rdf_class is None: reference, condition = string_separetion(str(result_triples_map.subject_constant)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_constant), condition, "constant", + subject_map = tm.SubjectMap(str(result_triples_map.subject_constant), condition, "constant","None","None", [result_triples_map.rdf_class], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") else: reference, condition = string_separetion(str(result_triples_map.subject_constant)) - subject_map = tm.SubjectMap(str(result_triples_map.subject_constant), condition, "constant", + subject_map = tm.SubjectMap(str(result_triples_map.subject_constant), condition, "constant","None","None", [str(result_triples_map.rdf_class)], result_triples_map.termtype, - [result_triples_map.graph]) + [result_triples_map.graph],"None") + elif result_triples_map.subject_function != None: + func_output = "None" + if result_triples_map.subject_output != None: + if "#" in result_triples_map.subject_output: + func_output = result_triples_map.subject_output.split("#")[1] + else: + func_output = result_triples_map.subject_output.split("/")[len(result_triples_map.subject_output.split("/"))-1] + if result_triples_map.rdf_class is None: + reference, condition = string_separetion(str(result_triples_map.subject_function)) + subject_map = tm.SubjectMap(str(result_triples_map.subject_function), condition, "function","None","None", + [result_triples_map.rdf_class], result_triples_map.termtype, + [result_triples_map.graph],func_output) + else: + reference, condition = string_separetion(str(result_triples_map.subject_function)) + subject_map = tm.SubjectMap(str(result_triples_map.subject_function), condition, "function","None","None", + [str(result_triples_map.rdf_class)], result_triples_map.termtype, + [result_triples_map.graph],func_output) + elif result_triples_map.subject_quoted != None: + if result_triples_map.rdf_class is None: + reference, condition = string_separetion(str(result_triples_map.subject_quoted)) + subject_map = tm.SubjectMap(str(result_triples_map.subject_quoted), condition, "quoted triples map", + result_triples_map.subject_parent_value, result_triples_map.subject_child_value, + [result_triples_map.rdf_class], result_triples_map.termtype, + [result_triples_map.graph],"None") + else: + reference, condition = string_separetion(str(result_triples_map.subject_quoted))
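All of these call sites pass nine positional arguments, with parent/child join values filled only for quoted triples maps and a function-output slot filled only for FNML executions. The extended SubjectMap constructor therefore appears to look roughly like this (inferred from the calls; not a verbatim copy of triples_map.py):

    class SubjectMap:
        def __init__(self, value, condition, subject_mapping_type,
                     parent, child, rdf_class, term_type, graph, func_result):
            # parent/child carry the join condition of a quoted triples map
            # (RML-star); func_result names the rml:returnMap output of an
            # FNML function execution. Both default to "None" elsewhere.
            self.value = value
            self.condition = condition
            self.subject_mapping_type = subject_mapping_type
            self.parent = parent
            self.child = child
            self.rdf_class = rdf_class
            self.term_type = term_type
            self.graph = graph
            self.func_result = func_result

+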
subject_map = tm.SubjectMap(str(result_triples_map.subject_quoted), condition, "quoted triples map", + result_triples_map.subject_parent_value, result_triples_map.subject_child_value, + [str(result_triples_map.rdf_class)], result_triples_map.termtype, + [result_triples_map.graph],"None") + + if new_formulation == "yes": + output_file = "" + if result_triples_map.subject_dump != None: + output_file = result_triples_map.subject_dump[7:] if result_triples_map.subject_dump[:7] == "file://" else result_triples_map.subject_dump + elif result_triples_map.subject_graph_dump != None: + output_file = result_triples_map.subject_graph_dump[7:] if result_triples_map.subject_graph_dump[:7] == "file://" else result_triples_map.subject_graph_dump + if output_file != "": + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:"subject"} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = "subject" mapping_query_prepared = prepareQuery(mapping_query) @@ -1551,65 +1928,213 @@ def mapping_parser(mapping_file): join_predicate = {} predicate_object_maps_list = [] predicate_object_graph = {} + + function = False for result_predicate_object_map in mapping_query_prepared_results: join = True if result_predicate_object_map.predicate_constant != None: - predicate_map = tm.PredicateMap("constant", str(result_predicate_object_map.predicate_constant), "") + predicate_map = tm.PredicateMap("constant", str(result_predicate_object_map.predicate_constant), "", "None") predicate_object_graph[ str(result_predicate_object_map.predicate_constant)] = result_triples_map.predicate_object_graph elif result_predicate_object_map.predicate_constant_shortcut != None: predicate_map = tm.PredicateMap("constant shortcut", - str(result_predicate_object_map.predicate_constant_shortcut), "") + str(result_predicate_object_map.predicate_constant_shortcut), "", "None") predicate_object_graph[ str(result_predicate_object_map.predicate_constant_shortcut)] = result_triples_map.predicate_object_graph elif result_predicate_object_map.predicate_template != None: template, condition = string_separetion(str(result_predicate_object_map.predicate_template)) - predicate_map = tm.PredicateMap("template", template, condition) + predicate_map = tm.PredicateMap("template", template, condition, "None") elif result_predicate_object_map.predicate_reference != None: reference, condition = string_separetion(str(result_predicate_object_map.predicate_reference)) - predicate_map = tm.PredicateMap("reference", reference, condition) + predicate_map = tm.PredicateMap("reference", reference, condition, "None") + elif result_predicate_object_map.predicate_function != None: + func_output = "None" + if result_predicate_object_map.predicate_output != None: + if "#" in result_predicate_object_map.predicate_output: + func_output = result_predicate_object_map.predicate_output.split("#")[1] + else: + func_output = result_predicate_object_map.predicate_output.split("/")[len(result_predicate_object_map.predicate_output.split("/"))-1] + predicate_map = tm.PredicateMap("function", str(result_predicate_object_map.predicate_function),"",func_output) else: - predicate_map = tm.PredicateMap("None", "None", "None") + predicate_map = tm.PredicateMap("None", "None", "None", "None") + + if new_formulation == "yes": + if result_predicate_object_map.predicate_dump != None: + output_file = 
result_predicate_object_map.predicate_dump[7:] if result_predicate_object_map.predicate_dump[:7] == "file://" else result_predicate_object_map.predicate_dump + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:[predicate_map.value]} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [predicate_map.value] + else: + if predicate_map.value not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(predicate_map.value) + + if "execute" in predicate_map.value: + function = True if result_predicate_object_map.object_constant != None: object_map = tm.ObjectMap("constant", str(result_predicate_object_map.object_constant), str(result_predicate_object_map.object_datatype), "None", "None", result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") elif result_predicate_object_map.object_template != None: object_map = tm.ObjectMap("template", str(result_predicate_object_map.object_template), str(result_predicate_object_map.object_datatype), "None", "None", result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") elif result_predicate_object_map.object_reference != None: object_map = tm.ObjectMap("reference", str(result_predicate_object_map.object_reference), str(result_predicate_object_map.object_datatype), "None", "None", result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") elif result_predicate_object_map.object_parent_triples_map != None: - if predicate_map.value + " " + str( - result_predicate_object_map.object_parent_triples_map) not in join_predicate: - join_predicate[ - predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { - "predicate": predicate_map, "childs": [str(result_predicate_object_map.child_value)], - "parents": [str(result_predicate_object_map.parent_value)], - "triples_map": str(result_predicate_object_map.object_parent_triples_map)} + if predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map) not in join_predicate: + if (result_predicate_object_map.child_function is None) and (result_predicate_object_map.parent_function is not None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { + "predicate":predicate_map, + "childs":[str(result_predicate_object_map.child_value)], + "parents":[str(result_predicate_object_map.parent_function)], + "triples_map":str(result_predicate_object_map.object_parent_triples_map)} + elif (result_predicate_object_map.child_function is not None) and (result_predicate_object_map.parent_function is None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { + "predicate":predicate_map, + "childs":[str(result_predicate_object_map.child_function)], + "parents":[str(result_predicate_object_map.parent_value)], + 
"triples_map":str(result_predicate_object_map.object_parent_triples_map)} + elif (result_predicate_object_map.child_function is not None) and (result_predicate_object_map.parent_function is not None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { + "predicate":predicate_map, + "childs":[str(result_predicate_object_map.child_function)], + "parents":[str(result_predicate_object_map.parent_function)], + "triples_map":str(result_predicate_object_map.object_parent_triples_map)} + else: + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)] = { + "predicate":predicate_map, + "childs":[str(result_predicate_object_map.child_value)], + "parents":[str(result_predicate_object_map.parent_value)], + "triples_map":str(result_predicate_object_map.object_parent_triples_map)} else: - join_predicate[ - predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ - "childs"].append(str(result_predicate_object_map.child_value)) - join_predicate[ - predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ - "parents"].append(str(result_predicate_object_map.parent_value)) + if (result_predicate_object_map.child_function is None) and (result_predicate_object_map.parent_function is not None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "childs"].append(str(result_predicate_object_map.child_function)) + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "parents"].append(str(result_predicate_object_map.parent_value)) + elif (result_predicate_object_map.child_function is not None) and (result_predicate_object_map.parent_function is None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "childs"].append(str(result_predicate_object_map.child_function)) + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "parents"].append(str(result_predicate_object_map.parent_value)) + elif (result_predicate_object_map.child_function is not None) and (result_predicate_object_map.parent_function is not None): + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "childs"].append(str(result_predicate_object_map.child_function)) + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "parents"].append(str(result_predicate_object_map.parent_function)) + else: + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "childs"].append(str(result_predicate_object_map.child_value)) + join_predicate[ + predicate_map.value + " " + str(result_predicate_object_map.object_parent_triples_map)][ + "parents"].append(str(result_predicate_object_map.parent_value)) join = False + elif result_predicate_object_map.function is not None: + func_output = "None" + if result_predicate_object_map.func_output != None: + if "#" in result_predicate_object_map.func_output: + func_output = result_predicate_object_map.func_output.split("#")[1] + else: + func_output = result_predicate_object_map.func_output.split("/")[len(result_predicate_object_map.func_output.split("/"))-1] + object_map = tm.ObjectMap("reference function", str(result_predicate_object_map.function), + 
str(result_predicate_object_map.object_datatype), "None", "None", + result_predicate_object_map.term, result_predicate_object_map.language, + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, func_output) + elif result_predicate_object_map.object_quoted != None: + object_map = tm.ObjectMap("quoted triples map", str(result_predicate_object_map.object_quoted), + str(result_predicate_object_map.object_datatype), + [str(result_predicate_object_map.child_value)], [str(result_predicate_object_map.parent_value)], + result_predicate_object_map.term, result_predicate_object_map.language, + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") elif result_predicate_object_map.object_constant_shortcut != None: object_map = tm.ObjectMap("constant shortcut", str(result_predicate_object_map.object_constant_shortcut), "None", "None", "None", result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") else: - object_map = tm.ObjectMap("None", "None", "None", "None", "None", "None", "None", "None") + object_map = tm.ObjectMap("None", "None", "None", "None", "None", "None", "None", "None", "None", "None") + + if new_formulation == "yes": + output_file = "" + if result_predicate_object_map.object_dump != None: + output_file = result_predicate_object_map.object_dump[7:] if result_predicate_object_map.object_dump[:7] == "file://" else result_predicate_object_map.object_dump + elif result_predicate_object_map.language_dump != None: + output_file = result_predicate_object_map.language_dump[7:] if result_predicate_object_map.language_dump[:7] == "file://" else result_predicate_object_map.language_dump + elif result_predicate_object_map.datatype_dump != None: + output_file = result_predicate_object_map.datatype_dump[7:] if result_predicate_object_map.datatype_dump[:7] == "file://" else result_predicate_object_map.datatype_dump + if output_file != "": + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:[object_map.value]} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + if result_predicate_object_map.language_dump != None: + if result_predicate_object_map.language != None: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [object_map.value + "_" + result_predicate_object_map.language] + elif result_predicate_object_map.language_value != None: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [object_map.value + "_" + result_predicate_object_map.language_value] + elif result_predicate_object_map.datatype_dump != None: + if result_predicate_object_map.object_datatype != None: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [str(object_map.value + "_" + result_predicate_object_map.object_datatype)] + elif result_predicate_object_map.datatype_value != None: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [str(object_map.value + "_" + result_predicate_object_map.datatype_value)] + else: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [object_map.value] + else: + if result_predicate_object_map.language_dump != None: + if result_predicate_object_map.language != None: + if result_predicate_object_map.language_value not in 
logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(object_map.value + "_" + result_predicate_object_map.language) + elif result_predicate_object_map.language_value != None: + if result_predicate_object_map.language_value not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(object_map.value + "_" + result_predicate_object_map.language_value) + elif result_predicate_object_map.datatype_dump != None: + if result_predicate_object_map.object_datatype != None: + if str(object_map.value + "_" + result_predicate_object_map.object_datatype) not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(str(object_map.value + "_" + result_predicate_object_map.object_datatype)) + elif result_predicate_object_map.datatype_value != None: + if str(object_map.value + "_" + result_predicate_object_map.datatype_value) not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(str(object_map.value + "_" + result_predicate_object_map.datatype_value)) + else: + if object_map.value not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(object_map.value) + if result_predicate_object_map.object_graph_dump != None: + output_file = result_predicate_object_map.object_graph_dump[7:] if result_predicate_object_map.object_graph_dump[:7] == "file://" else result_predicate_object_map.object_graph_dump + if output_file != "": + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:[object_map.value]} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = [object_map.value] + else: + if object_map.value not in logical_dump[str(result_triples_map.triples_map_id)][output_file]: + logical_dump[str(result_triples_map.triples_map_id)][output_file].append(object_map.value) + if join: predicate_object_maps_list += [ tm.PredicateObjectMap(predicate_map, object_map, predicate_object_graph)] @@ -1620,18 +2145,34 @@ def mapping_parser(mapping_file): str(result_predicate_object_map.object_datatype), join_predicate[jp]["childs"], join_predicate[jp]["parents"], result_predicate_object_map.term, result_predicate_object_map.language, - result_predicate_object_map.language_value) + result_predicate_object_map.language_value, + result_predicate_object_map.datatype_value, "None") predicate_object_maps_list += [ tm.PredicateObjectMap(join_predicate[jp]["predicate"], object_map, predicate_object_graph)] - if result_triples_map.url_source is not None: - current_triples_map = tm.TriplesMap(str(result_triples_map.triples_map_id), - str(result_triples_map.url_source), subject_map, - predicate_object_maps_list, - ref_form=str(result_triples_map.ref_form), - iterator=str(result_triples_map.iterator), - tablename=str(result_triples_map.tablename), - query=str(result_triples_map.query)) + if result_triples_map.delimiter is not None: + url_source = str(result_triples_map.url_source)[7:] if str(result_triples_map.url_source)[:7] == "file://" else str(result_triples_map.url_source) + delimiter[url_source] = 
str(result_triples_map.delimiter) + if ".xml" in str(result_triples_map.url_source) and str(result_triples_map.ref_form) != "http://w3id.org/rml/XPath": + current_triples_map = tm.TriplesMap(str(result_triples_map.triples_map_id), + str(result_triples_map.url_source), subject_map, + predicate_object_maps_list, + ref_form="http://w3id.org/rml/XPath", + iterator=str(result_triples_map.iterator), + tablename=str(result_triples_map.tablename), + query=str(result_triples_map.query), + function=function,func_map_list=func_map_list, + mappings_type=str(result_triples_map.mappings_type)) + else: + current_triples_map = tm.TriplesMap(str(result_triples_map.triples_map_id), + str(result_triples_map.url_source), subject_map, + predicate_object_maps_list, + ref_form=str(result_triples_map.ref_form), + iterator=str(result_triples_map.iterator), + tablename=str(result_triples_map.tablename), + query=str(result_triples_map.query), + function=function,func_map_list=func_map_list, + mappings_type=str(result_triples_map.mappings_type)) else: current_triples_map = tm.TriplesMap(str(result_triples_map.triples_map_id), str(result_triples_map.data_source), subject_map, @@ -1639,7 +2180,10 @@ def mapping_parser(mapping_file): ref_form=str(result_triples_map.ref_form), iterator=str(result_triples_map.iterator), tablename=str(result_triples_map.tablename), - query=str(result_triples_map.query)) + query=str(result_triples_map.query), + function=function,func_map_list=func_map_list, + mappings_type=str(result_triples_map.mappings_type)) + triples_map_list += [current_triples_map] else: @@ -1650,6 +2194,66 @@ def mapping_parser(mapping_file): if result_triples_map.graph not in triples_map.subject_map.graph: triples_map.graph.append(result_triples_map.graph) + if new_formulation == "yes": + output_file = "" + if result_triples_map.subject_dump != None: + output_file = result_triples_map.subject_dump[7:] if result_triples_map.subject_dump[:7] == "file://" else result_triples_map.subject_dump + elif result_triples_map.subject_graph_dump != None: + output_file = result_triples_map.subject_graph_dump[7:] if result_triples_map.subject_graph_dump[:7] == "file://" else result_triples_map.subject_graph_dump + if output_file != "": + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:"subject"} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = "subject" + + if result_triples_map.predicate_constant_shortcut != None: + for po in triples_map.predicate_object_maps_list: + if po.predicate_map.value == str(result_triples_map.predicate_constant_shortcut): + if str(result_triples_map.predicate_constant_shortcut) in po.graph: + po.graph[str(result_triples_map.predicate_constant_shortcut)] = result_triples_map.predicate_object_graph + + if new_formulation == "yes": + output_file = "" + if result_triples_map.predicate_dump != None: + if result_triples_map.predicate_constant != None: + value = result_triples_map.predicate_constant + elif result_triples_map.predicate_template != None: + value = result_triples_map.predicate_template + elif result_triples_map.predicate_reference != None: + value = result_triples_map.predicate_reference + output_file = result_triples_map.predicate_dump[7:] if result_triples_map.predicate_dump[:7] == "file://" else result_triples_map.predicate_dump + + if str(result_triples_map.triples_map_id) not in logical_dump: + 
logical_dump[str(result_triples_map.triples_map_id)] = {output_file:value} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = value + + output_file = "" + if result_triples_map.object_dump != None: + output_file = result_triples_map.object_dump[7:] if result_triples_map.object_dump[:7] == "file://" else result_triples_map.object_dump + elif result_triples_map.object_graph_dump != None: + output_file = result_triples_map.object_graph_dump[7:] if result_triples_map.object_graph_dump[:7] == "file://" else result_triples_map.object_graph_dump + elif result_triples_map.language_dump != None: + output_file = result_triples_map.language_dump[7:] if result_triples_map.language_dump[:7] == "file://" else result_triples_map.language_dump + elif result_triples_map.datatype_dump != None: + output_file = result_triples_map.datatype_dump[7:] if result_triples_map.datatype_dump[:7] == "file://" else result_triples_map.datatype_dump + if output_file != "": + if result_triples_map.object_constant != None: + value = result_triples_map.object_constant + elif result_triples_map.object_reference != None: + value = result_triples_map.object_reference + elif result_triples_map.object_template != None: + value = result_triples_map.object_template + elif result_triples_map.object_parent_triples_map != None: + value = result_triples_map.object_parent_triples_map + if str(result_triples_map.triples_map_id) not in logical_dump: + logical_dump[str(result_triples_map.triples_map_id)] = {output_file:value} + else: + if output_file not in logical_dump[str(result_triples_map.triples_map_id)]: + logical_dump[str(result_triples_map.triples_map_id)][output_file] = value + return mappings_expansion(triples_map_list) @@ -1662,474 +2266,605 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): object_list = [] global blank_message global host, port, user, password, datab - with open(str(triples_map.data_source), "r") as input_file_descriptor: - tree = ET.parse(input_file_descriptor) + if "http" in triples_map.data_source: + response = requests.get(triples_map.data_source, stream=True) + root = ET.fromstring(response.content) + else: + tree = ET.parse(triples_map.data_source) root = tree.getroot() - if "[" not in triples_map.iterator: - level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - 1] - else: - temp = triples_map.iterator.split("[")[0] - level = temp.split("/")[len(temp.split("/")) - 1] - parent_map = {c: p for p in tree.iter() for c in p} - namespace = dict([node for _, node in ET.iterparse(str(triples_map.data_source), events=['start-ns'])]) - if namespace: - for name in namespace: - ET.register_namespace(name, namespace[name]) - if "/" in triples_map.iterator: - parent_level = 2 - while len(list(root.iterfind(level, namespace))) == 0: - if triples_map.iterator != level: - level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - parent_level] + "/" + level - parent_level += 1 - else: + if "[" not in triples_map.iterator: + level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - 1] + if level == "": + i = 1 + while i < len(triples_map.iterator.split("/")) - 1: + level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - i] + if level != "": break - else: - level = "." 
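Two patterns recur in the hunks above and are easy to lose in the diff noise. First, every `*_dump` binding handled by `mapping_parser` (subject_dump, predicate_dump, object_dump, language_dump, datatype_dump, and their graph variants) is normalized and folded into the module-level `logical_dump` registry, keyed by triples map id and output file. A minimal sketch of that bookkeeping, with `register_dump` as a hypothetical helper name (the shipped code inlines this logic, and for subject dumps stores the bare string "subject" instead of a list):

# register_dump is an illustrative name, not part of the rdfizer API.
def register_dump(logical_dump: dict, tm_id: str, target: str, value: str) -> None:
    # Dump targets may carry a file:// scheme; keep only the local path.
    target = target[7:] if target.startswith("file://") else target
    # logical_dump maps triples-map id -> {output file -> values written there}.
    values = logical_dump.setdefault(tm_id, {}).setdefault(target, [])
    if value not in values:  # avoid registering the same term twice
        values.append(value)

Second, the rewritten `semantify_xml` preamble derives the element name to iterate over from the RML iterator string: it cuts any XPath predicate at the first "[", then walks the path backwards past the empty segments a trailing "/" leaves behind, and falls back to "." so iteration starts at the current node (when `root.iterfind(level, namespace)` still matches nothing, the shipped code additionally retries with parent segments prepended). A compact sketch of the leaf derivation, with `iterator_leaf` as an illustrative name only:

def iterator_leaf(iterator: str) -> str:
    path = iterator.split("[")[0]  # "item[@id='1']" -> "item"
    for segment in reversed(path.split("/")):  # skip "" left by a trailing "/"
        if segment != "":
            return segment
    return "."  # nothing usable: iterate from the current node

assert iterator_leaf("/root/items/item") == "item"
assert iterator_leaf("root/item/") == "item"
assert iterator_leaf("item[@id='1']") == "item"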
- if mapping_partitions == "yes": - if triples_map.predicate_object_maps_list[0].predicate_map.mapping_type == "constant" or \ - triples_map.predicate_object_maps_list[0].predicate_map.mapping_type == "constant shortcut": - predicate = "<" + triples_map.predicate_object_maps_list[0].predicate_map.value + ">" - constant_predicate = False + i += 1 + else: + temp = triples_map.iterator.split("[")[0] + level = temp.split("/")[len(temp.split("/")) - 1] + if level == "": + i = 1 + while i < len(temp.split("/")) - 1: + level = temp.split("/")[len(temp.split("/")) - i] + if level != "": + break + i += 1 + parent_map = {c: p for p in root.iter() for c in p} + if "http" in triples_map.data_source: + namespace = {} + for elem in root.iter(): + namespace_uri = elem.tag.split('}')[0][1:] + if namespace_uri and ':' in elem.tag: + prefix = elem.tag.split(':')[0] + namespace[prefix] = namespace_uri + else: + namespace = dict([node for _, node in ET.iterparse(str(triples_map.data_source), events=['start-ns'])]) + if namespace: + for name in namespace: + ET.register_namespace(name, namespace[name]) + if "/" in triples_map.iterator: + parent_level = 2 + while len(list(root.iterfind(level, namespace))) == 0: + if triples_map.iterator != level: + level = triples_map.iterator.split("/")[len(triples_map.iterator.split("/")) - parent_level] + "/" + level + parent_level += 1 else: - predicate = None - constant_predicate = True + break + else: + level = "." + if mapping_partitions == "yes": + if triples_map.predicate_object_maps_list[0].predicate_map.mapping_type == "constant" or \ + triples_map.predicate_object_maps_list[0].predicate_map.mapping_type == "constant shortcut": + predicate = "<" + triples_map.predicate_object_maps_list[0].predicate_map.value + ">" + constant_predicate = False else: predicate = None constant_predicate = True - for child in root.iterfind(level, namespace): - create_subject = True - global generated_subjects + else: + predicate = None + constant_predicate = True + for child in root.iterfind(level, namespace): + create_subject = True + global generated_subjects - if mapping_partitions == "yes": - if "_" in triples_map.triples_map_id: - componets = triples_map.triples_map_id.split("_")[:-1] - triples_map_id = "" - for name in componets: - triples_map_id += name + "_" - triples_map_id = triples_map_id[:-1] - else: - triples_map_id = triples_map.triples_map_id + if mapping_partitions == "yes": + if "_" in triples_map.triples_map_id: + componets = triples_map.triples_map_id.split("_")[:-1] + triples_map_id = "" + for name in componets: + triples_map_id += name + "_" + triples_map_id = triples_map_id[:-1] + else: + triples_map_id = triples_map.triples_map_id - subject_attr = extract_subject_values(child, generated_subjects[triples_map_id]["subject_attr"], "XML", - parent_map) + subject_attr = extract_subject_values(child, generated_subjects[triples_map_id]["subject_attr"], "XML", + parent_map) - if subject_attr == None: - subject = None - create_subject = False - else: - if triples_map_id in generated_subjects: - if subject_attr in generated_subjects[triples_map_id]: - subject = generated_subjects[triples_map_id][subject_attr] - create_subject = False - - if create_subject: - subject_value = string_substitution_xml(triples_map.subject_map.value, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, namespace) - if triples_map.subject_map.subject_mapping_type == "template": - if triples_map.subject_map.term_type is None: - if triples_map.subject_map.condition == "": + if subject_attr 
== None: + subject = None + create_subject = False + else: + if triples_map_id in generated_subjects: + if subject_attr in generated_subjects[triples_map_id]: + subject = generated_subjects[triples_map_id][subject_attr] + create_subject = False - try: - subject = "<" + subject_value + ">" - except: - subject = None + if create_subject: + subject_value = string_substitution_xml(triples_map.subject_map.value, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, namespace) + if triples_map.subject_map.subject_mapping_type == "template": + if triples_map.subject_map.term_type is None: + if triples_map.subject_map.condition == "": - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - subject = "<" + subject_value + ">" - except: - subject = None - else: - if "IRI" in triples_map.subject_map.term_type: - if triples_map.subject_map.condition == "": + try: + subject = "<" + subject_value + ">" + except: + subject = None - try: - subject = "<" + base + subject_value + ">" - except: - subject = None + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + subject = "<" + subject_value + ">" + except: + subject = None + else: + if "IRI" in triples_map.subject_map.term_type: + if triples_map.subject_map.condition == "": - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - if "http" not in subject_value: + try: + subject = "<" + base + subject_value + ">" + except: + subject = None + + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + if "http" not in subject_value: + if base != "": subject = "<" + base + subject_value + ">" else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + else: + if is_valid_url_syntax(subject_value): subject = "<" + subject_value + ">" - except: - subject = None - - elif "BlankNode" in triples_map.subject_map.term_type: - if triples_map.subject_map.condition == "": - - try: - if "/" in subject_value: - subject = "_:" + encode_char(subject_value.replace("/", "2F")).replace("%", "") - if blank_message: - logger.warning( - "Incorrect format for Blank Nodes. \"/\" will be replace with \"2F\".") - blank_message = False else: - subject = "_:" + encode_char(subject_value).replace("%", "") - if "." in subject: - subject = subject.replace(".", "2E") - except: - subject = None + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + except: + subject = None - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - subject = "_:" + subject_value - except: - subject = None + elif "BlankNode" in triples_map.subject_map.term_type: + if triples_map.subject_map.condition == "": - elif "Literal" in triples_map.subject_map.term_type: - subject = None + try: + if "/" in subject_value: + subject = "_:" + encode_char(subject_value.replace("/", "2F")).replace("%", "") + if blank_message: + logger.warning( + "Incorrect format for Blank Nodes. \"/\" will be replace with \"2F\".") + blank_message = False + else: + subject = "_:" + encode_char(subject_value).replace("%", "") + if "." 
in subject: + subject = subject.replace(".", "2E") + except: + subject = None else: - if triples_map.subject_map.condition == "": + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + subject = "_:" + subject_value + except: + subject = None - try: - subject = "<" + subject_value + ">" - except: - subject = None + elif "Literal" in triples_map.subject_map.term_type: + subject = None - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - subject = "<" + subject_value + ">" - except: - subject = None + else: + if triples_map.subject_map.condition == "": - elif "reference" in triples_map.subject_map.subject_mapping_type: - if triples_map.subject_map.condition == "": - subject_value = string_substitution_xml(triples_map.subject_map.value, ".+", child, "subject", - triples_map.iterator, parent_map, namespace) - subject_value = subject_value[0][1:-1] - try: - if " " not in subject_value: - if "http" not in subject_value: + try: + subject = "<" + subject_value + ">" + except: + subject = None + + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + subject = "<" + subject_value + ">" + except: + subject = None + + elif "reference" in triples_map.subject_map.subject_mapping_type: + if triples_map.subject_map.condition == "": + subject_value = string_substitution_xml(triples_map.subject_map.value, ".+", child, "subject", + triples_map.iterator, parent_map, namespace) + subject_value = subject_value[0][1:-1] + try: + if " " not in subject_value: + if "http" not in subject_value: + if base != "": subject = "<" + base + subject_value + ">" else: - subject = "<" + subject_value + ">" + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - logger.error(" is an invalid URL") - subject = None - except: + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + else: + logger.error(" is an invalid URL") + subject = None + except: + subject = None + if triples_map.subject_map.term_type == "IRI": + if " " not in subject_value: + subject = "<" + encode_char(subject_value) + ">" + else: subject = None - if triples_map.subject_map.term_type == "IRI": - if " " not in subject_value: - subject = "<" + encode_char(subject_value) + ">" - else: - subject = None - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - if "http" not in subject_value: + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + if "http" not in subject_value: + if base != "": subject = "<" + base + subject_value + ">" else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + else: + if is_valid_url_syntax(subject_value): subject = "<" + subject_value + ">" - except: - subject = None + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + except: + subject = None - elif "constant" in triples_map.subject_map.subject_mapping_type: - subject = "<" + triples_map.subject_map.value + ">" + elif "constant" in triples_map.subject_map.subject_mapping_type: + subject = 
"<" + triples_map.subject_map.value + ">" - else: - if triples_map.subject_map.condition == "": + else: + if triples_map.subject_map.condition == "": - try: - subject = "\"" + triples_map.subject_map.value + "\"" - except: - subject = None + try: + subject = "\"" + triples_map.subject_map.value + "\"" + except: + subject = None - else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - subject = "\"" + triples_map.subject_map.value + "\"" - except: - subject = None + else: + # field, condition = condition_separetor(triples_map.subject_map.condition) + # if row[field] == condition: + try: + subject = "\"" + triples_map.subject_map.value + "\"" + except: + subject = None - if mapping_partitions == "yes": - if triples_map_id in generated_subjects: - if subject_attr in generated_subjects[triples_map_id]: - pass - else: - generated_subjects[triples_map_id][subject_attr] = subject + if mapping_partitions == "yes": + if triples_map_id in generated_subjects: + if subject_attr in generated_subjects[triples_map_id]: + pass else: - generated_subjects[triples_map_id] = {subject_attr: subject} + generated_subjects[triples_map_id][subject_attr] = subject + else: + generated_subjects[triples_map_id] = {subject_attr: subject} - if triples_map.subject_map.rdf_class != [None] and subject != None: - predicate = "" - for rdf_class in triples_map.subject_map.rdf_class: - if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): - obj = "<{}>".format(rdf_class) - dictionary_table_update(subject) - dictionary_table_update(obj) - dictionary_table_update(predicate + "_" + obj) - rdf_type = subject + " " + predicate + " " + obj + ".\n" - for graph in triples_map.subject_map.graph: - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - rdf_type = rdf_type[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, - "subject", - triples_map.iterator, - parent_map, - namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, - namespace) + ">") - else: - rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if dic_table[predicate + "_" + obj] not in g_triples: - output_file_descriptor.write(rdf_type) - g_triples.update({dic_table[predicate + "_" + obj]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + obj]]: - output_file_descriptor.write(rdf_type) - g_triples[dic_table[predicate + "_" + obj]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 + if triples_map.subject_map.rdf_class != [None] and subject != None: + predicate = "" + for rdf_class in triples_map.subject_map.rdf_class: + if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): + obj = "<{}>".format(rdf_class) + dictionary_table_update(subject) + dictionary_table_update(obj) + dictionary_table_update(predicate + "_" + obj) + rdf_type = subject + " " + predicate + " " + obj + ".\n" + for graph in triples_map.subject_map.graph: + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + rdf_type = rdf_type[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, + "subject", + triples_map.iterator, + parent_map, + namespace) + ">.\n" + dictionary_table_update( + "<" + 
string_substitution_xml(graph, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, + namespace) + ">") else: + rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if duplicate == "yes": + if dic_table[predicate + "_" + obj] not in g_triples: output_file_descriptor.write(rdf_type) + g_triples.update({dic_table[predicate + "_" + obj]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + obj]]: + output_file_descriptor.write(rdf_type) + g_triples[dic_table[predicate + "_" + obj]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 - for predicate_object_map in triples_map.predicate_object_maps_list: - if constant_predicate: - if predicate_object_map.predicate_map.mapping_type == "constant" or predicate_object_map.predicate_map.mapping_type == "constant shortcut": - predicate = "<" + predicate_object_map.predicate_map.value + ">" - elif predicate_object_map.predicate_map.mapping_type == "template": - if predicate_object_map.predicate_map.condition != "": - # field, condition = condition_separetor(predicate_object_map.predicate_map.condition) - # if row[field] == condition: - try: - predicate = "<" + string_substitution_xml(predicate_object_map.predicate_map.value, - "{(.+?)}", child, "predicate", - triples_map.iterator, parent_map, - namespace) + ">" - except: - predicate = None - # else: - # predicate = None - else: - try: - predicate = "<" + string_substitution_xml(predicate_object_map.predicate_map.value, - "{(.+?)}", child, "predicate", - triples_map.iterator, parent_map, - namespace) + ">" - except: - predicate = None - elif predicate_object_map.predicate_map.mapping_type == "reference": - if predicate_object_map.predicate_map.condition != "": - # field, condition = condition_separetor(predicate_object_map.predicate_map.condition) - # if row[field] == condition: - predicate = string_substitution_xml(predicate_object_map.predicate_map.value, ".+", child, - "predicate", triples_map.iterator, parent_map, - namespace) - # else: - # predicate = None else: - predicate = string_substitution_xml(predicate_object_map.predicate_map.value, ".+", child, - "predicate", triples_map.iterator, parent_map, - namespace) - predicate = "<" + predicate[1:-1] + ">" + output_file_descriptor.write(rdf_type) + i += 1 + for predicate_object_map in triples_map.predicate_object_maps_list: + if constant_predicate: + if predicate_object_map.predicate_map.mapping_type == "constant" or predicate_object_map.predicate_map.mapping_type == "constant shortcut": + predicate = "<" + predicate_object_map.predicate_map.value + ">" + elif predicate_object_map.predicate_map.mapping_type == "template": + if predicate_object_map.predicate_map.condition != "": + # field, condition = condition_separetor(predicate_object_map.predicate_map.condition) + # if row[field] == condition: + try: + predicate = "<" + string_substitution_xml(predicate_object_map.predicate_map.value, + "{(.+?)}", child, "predicate", + triples_map.iterator, parent_map, + namespace) + ">" + except: + predicate = None + # else: + # predicate = None else: - predicate = None - - if predicate_object_map.object_map.mapping_type == "constant" or predicate_object_map.object_map.mapping_type == "constant shortcut": - if "/" in predicate_object_map.object_map.value: - object = "<" + predicate_object_map.object_map.value + ">" + try: + predicate = "<" + 
string_substitution_xml(predicate_object_map.predicate_map.value, + "{(.+?)}", child, "predicate", + triples_map.iterator, parent_map, + namespace) + ">" + except: + predicate = None + elif predicate_object_map.predicate_map.mapping_type == "reference": + if predicate_object_map.predicate_map.condition != "": + # field, condition = condition_separetor(predicate_object_map.predicate_map.condition) + # if row[field] == condition: + predicate = string_substitution_xml(predicate_object_map.predicate_map.value, ".+", child, + "predicate", triples_map.iterator, parent_map, + namespace) + # else: + # predicate = None else: - object = "\"" + predicate_object_map.object_map.value + "\"" - if predicate_object_map.object_map.datatype != None: - object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) - elif predicate_object_map.object_map.mapping_type == "template": - object = string_substitution_xml(predicate_object_map.object_map.value, "{(.+?)}", child, "object", - triples_map.iterator, parent_map, namespace) - if isinstance(object, list): - for i in range(len(object)): - if predicate_object_map.object_map.term is None: - object[i] = "<" + object[i] + ">" - elif "IRI" in predicate_object_map.object_map.term: - object[i] = "<" + object[i] + ">" - else: - object[i] = "\"" + object[i] + "\"" - if predicate_object_map.object_map.datatype != None: - object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format( - predicate_object_map.object_map.datatype) - elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: - object[i] += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: - object[i] += "@en" - elif len(predicate_object_map.object_map.language) == 2: - object[i] += "@" + predicate_object_map.object_map.language - elif predicate_object_map.object_map.language_map != None: - lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", - child, "object", triples_map.iterator, parent_map, - namespace) - if lang != None: - object[i] += "@" + string_substitution_xml( - predicate_object_map.object_map.language_map, ".+", child, "object", - triples_map.iterator, parent_map, namespace)[1:-1] + predicate = string_substitution_xml(predicate_object_map.predicate_map.value, ".+", child, + "predicate", triples_map.iterator, parent_map, + namespace) + predicate = "<" + predicate[1:-1] + ">" + else: + predicate = None + + if predicate_object_map.object_map.mapping_type == "constant" or predicate_object_map.object_map.mapping_type == "constant shortcut": + if "/" in predicate_object_map.object_map.value: + object = "<" + predicate_object_map.object_map.value + ">" + else: + object = "\"" + predicate_object_map.object_map.value + "\"" + if predicate_object_map.object_map.datatype != None: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.mapping_type == "template": + 
object = string_substitution_xml(predicate_object_map.object_map.value, "{(.+?)}", child, "object", + triples_map.iterator, parent_map, namespace) + if isinstance(object, list): + for i in range(len(object)): if predicate_object_map.object_map.term is None: - object = "<" + object + ">" + object[i] = "<" + object[i] + ">" elif "IRI" in predicate_object_map.object_map.term: - object = "<" + object + ">" + object[i] = "<" + object[i] + ">" else: - object = "\"" + object + "\"" + object[i] = "\"" + object[i] + "\"" if predicate_object_map.object_map.datatype != None: - object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format( predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: - object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: - object += "@en" + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object[i] += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object[i] += "@en" elif len(predicate_object_map.object_map.language) == 2: - object += "@" + predicate_object_map.object_map.language + object[i] += "@" + predicate_object_map.object_map.language + else: + object[i] = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", child, "object", triples_map.iterator, parent_map, namespace) if lang != None: - object += "@" + string_substitution_xml( + object[i] += "@" + string_substitution_xml( predicate_object_map.object_map.language_map, ".+", child, "object", triples_map.iterator, parent_map, namespace)[1:-1] - elif predicate_object_map.object_map.mapping_type == "reference": - object = string_substitution_xml(predicate_object_map.object_map.value, ".+", child, "object", - triples_map.iterator, parent_map, namespace) - if object != None: - if isinstance(object, list): - for i in range(len(object)): - if "\\" in object[i][1:-1]: - object = "\"" + object[i][1:-1].replace("\\", "\\\\") + "\"" - if "'" in object[i][1:-1]: - object = "\"" + object[i][1:-1].replace("'", "\\\\'") + "\"" - if "\"" in object[i][1:-1]: - object = "\"" + object[i][1:-1].replace("\"", "\\\"") + "\"" - if "\n" in object[i]: - object[i] = object[i].replace("\n", "\\n") - if predicate_object_map.object_map.datatype != None: - object[i] += "^^<{}>".format(predicate_object_map.object_map.datatype) - elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: - object[i] += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: - object[i] += "@en" - elif len(predicate_object_map.object_map.language) == 
2: - object[i] += "@" + predicate_object_map.object_map.language - elif predicate_object_map.object_map.language_map != None: - lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", - child, "object", triples_map.iterator, parent_map, - namespace) - if lang != None: - object[i] += "@" + string_substitution_xml( - predicate_object_map.object_map.language_map, ".+", child, "object", - triples_map.iterator, parent_map, namespace)[1:-1] - elif predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - if " " not in object: - object[i] = "\"" + object[i][1:-1].replace("\\\\'", "'") + "\"" - object[i] = "<" + encode_char(object[i][1:-1]) + ">" - else: - object[i] = None - else: - if "\\" in object[1:-1]: - object = "\"" + object[1:-1].replace("\\", "\\\\") + "\"" - if "'" in object[1:-1]: - object = "\"" + object[1:-1].replace("'", "\\\\'") + "\"" - if "\"" in object[1:-1]: - object = "\"" + object[1:-1].replace("\"", "\\\"") + "\"" - if "\n" in object: - object = object.replace("\n", "\\n") + else: + if predicate_object_map.object_map.term is None: + object = "<" + object + ">" + elif "IRI" in predicate_object_map.object_map.term: + object = "<" + object + ">" + else: + object = "\"" + object + "\"" + if predicate_object_map.object_map.datatype != None: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( + predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.language != None: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object += "@en" + elif len(predicate_object_map.object_map.language) == 2: + object += "@" + predicate_object_map.object_map.language + else: + object = None + elif predicate_object_map.object_map.language_map != None: + lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", + child, "object", triples_map.iterator, parent_map, + namespace) + if lang != None: + object += "@" + string_substitution_xml( + predicate_object_map.object_map.language_map, ".+", child, "object", + triples_map.iterator, parent_map, namespace)[1:-1] + elif predicate_object_map.object_map.mapping_type == "reference": + object = string_substitution_xml(predicate_object_map.object_map.value, ".+", child, "object", + triples_map.iterator, parent_map, namespace) + if object != None: + if isinstance(object, list): + for i in range(len(object)): + if "\\" in object[i][1:-1]: + object[i] = "\"" + object[i][1:-1].replace("\\", "\\\\") + "\"" + if "'" in object[i][1:-1]: + object[i] = "\"" + object[i][1:-1].replace("'", "\\\\'") + "\"" + if "\"" in object[i][1:-1]: + object[i] = "\"" + object[i][1:-1].replace("\"", "\\\"") + "\"" + if "\n" in object[i]: + object[i] = object[i].replace("\n", "\\n") if predicate_object_map.object_map.datatype != None: - object += "^^<{}>".format(predicate_object_map.object_map.datatype) + object[i] += 
"^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: - object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: - object += "@en" + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object[i] += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object[i] += "@en" elif len(predicate_object_map.object_map.language) == 2: - object += "@" + predicate_object_map.object_map.language + object[i] += "@" + predicate_object_map.object_map.language + else: + object[i] = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", child, "object", triples_map.iterator, parent_map, namespace) if lang != None: - object += "@" + string_substitution_xml( + object[i] += "@" + string_substitution_xml( predicate_object_map.object_map.language_map, ".+", child, "object", triples_map.iterator, parent_map, namespace)[1:-1] elif predicate_object_map.object_map.term != None: if "IRI" in predicate_object_map.object_map.term: if " " not in object: - object = "\"" + object[1:-1].replace("\\\\'", "'") + "\"" - object = "<" + encode_char(object[1:-1]) + ">" + object[i] = "\"" + object[i][1:-1].replace("\\\\'", "'") + "\"" + object[i] = "<" + encode_char(object[i][1:-1]) + ">" else: - object = None - elif predicate_object_map.object_map.mapping_type == "parent triples map": - if subject != None: - for triples_map_element in triples_map_list: - if triples_map_element.triples_map_id == predicate_object_map.object_map.value: - if triples_map_element.data_source != triples_map.data_source: - if triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[ - 0] not in join_table: - if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": - if "http" in triples_map_element.data_source: - if triples_map_element.file_format == "JSONPath": - response = urlopen(triples_map_element.data_source) - data = json.loads(response.read()) + object[i] = None + else: + if "\\" in object[1:-1]: + object = "\"" + object[1:-1].replace("\\", "\\\\") + "\"" + if "'" in object[1:-1]: + object = "\"" + object[1:-1].replace("'", "\\\\'") + "\"" + if "\"" in object[1:-1]: + object = "\"" + object[1:-1].replace("\"", "\\\"") + "\"" + if "\n" in object: + object = object.replace("\n", "\\n") + if predicate_object_map.object_map.datatype != None: + object += "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_xml(predicate_object_map.object_map.datatype_map, "{(.+?)}", child, + "object", triples_map.iterator, parent_map, namespace) + if "http" in datatype_value: + object = "\"" + 
object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.language != None: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object += "@en" + elif len(predicate_object_map.object_map.language) == 2: + object += "@" + predicate_object_map.object_map.language + else: + object = None + elif predicate_object_map.object_map.language_map != None: + lang = string_substitution_xml(predicate_object_map.object_map.language_map, ".+", + child, "object", triples_map.iterator, parent_map, + namespace) + if lang != None: + object += "@" + string_substitution_xml( + predicate_object_map.object_map.language_map, ".+", child, "object", + triples_map.iterator, parent_map, namespace)[1:-1] + elif predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + if " " not in object: + object = "\"" + object[1:-1].replace("\\\\'", "'") + "\"" + object = "<" + encode_char(object[1:-1]) + ">" + else: + object = None + elif predicate_object_map.object_map.mapping_type == "parent triples map": + if subject != None: + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == predicate_object_map.object_map.value: + if triples_map_element.data_source != triples_map.data_source: + if triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[ + 0] not in join_table: + if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": + if "http" in triples_map_element.data_source: + if triples_map_element.file_format == "JSONPath": + response = urlopen(triples_map_element.data_source) + data = json.loads(response.read()) + if isinstance(data, list): + hash_maker(data, triples_map_element, + predicate_object_map.object_map,"", triples_map_list) + elif len(data) < 2: + hash_maker(data[list(data.keys())[0]], triples_map_element, + predicate_object_map.object_map,"", triples_map_list) + else: + with open(str(triples_map_element.data_source), + "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + data = csv.DictReader(input_file_descriptor, delimiter=",") + hash_maker(data, triples_map_element, + predicate_object_map.object_map,"", triples_map_list) + else: + data = json.load(input_file_descriptor) if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) - else: - with open(str(triples_map_element.data_source), - "r") as input_file_descriptor: - if str(triples_map_element.file_format).lower() == "csv": - data = csv.DictReader(input_file_descriptor, delimiter=",") - hash_maker(data, triples_map_element, - predicate_object_map.object_map) - else: - data = json.load(input_file_descriptor) - if isinstance(data, list): - hash_maker(data, triples_map_element, - predicate_object_map.object_map) - elif len(data) < 2: - hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) - elif triples_map_element.file_format == "XPath": - 
with open(str(triples_map_element.data_source), - "r") as input_file_descriptor: - child_tree = ET.parse(input_file_descriptor) - child_root = child_tree.getroot() - hash_maker_xml(child_root, triples_map_element, - predicate_object_map.object_map, parent_map, namespace) + elif triples_map_element.file_format == "XPath": + with open(str(triples_map_element.data_source), + "r") as input_file_descriptor: + child_tree = ET.parse(input_file_descriptor) + child_root = child_tree.getroot() + hash_maker_xml(child_root, triples_map_element, + predicate_object_map.object_map, parent_map, namespace) + else: + database, query_list = translate_sql(triples_map_element) + db = connector.connect(host=host, port=int(port), user=user, + password=password) + cursor = db.cursor(buffered=True) + cursor.execute("use " + datab) + for query in query_list: + cursor.execute(query) + hash_maker_array(cursor, triples_map_element, + predicate_object_map.object_map) + + if "@" in predicate_object_map.object_map.child[0]: + child_condition = predicate_object_map.object_map.child[0].split("@")[len(predicate_object_map.object_map.child[0].split("@"))-1] + if child_condition in child.attrib: + if child.attrib[child_condition] != None: + if child.attrib[child_condition] in join_table[ + triples_map_element.triples_map_id + "_" + + predicate_object_map.object_map.child[0]]: + object_list = join_table[triples_map_element.triples_map_id + "_" + + predicate_object_map.object_map.child[0]][ + child.attrib[child_condition]] + else: + object_list = [] + else: + object_list = [] + else: + if child.find(predicate_object_map.object_map.child[0]) != None: + if child.find(predicate_object_map.object_map.child[0]).text in join_table[ + triples_map_element.triples_map_id + "_" + + predicate_object_map.object_map.child[0]]: + object_list = join_table[triples_map_element.triples_map_id + "_" + + predicate_object_map.object_map.child[0]][ + child.find(predicate_object_map.object_map.child[0]).text] else: - database, query_list = translate_sql(triples_map_element) - db = connector.connect(host=host, port=int(port), user=user, - password=password) - cursor = db.cursor(buffered=True) - cursor.execute("use " + datab) - for query in query_list: - cursor.execute(query) - hash_maker_array(cursor, triples_map_element, - predicate_object_map.object_map) + object_list = [] + object = None + else: + if predicate_object_map.object_map.parent != None: + if triples_map_element.triples_map_id + "_" + \ + predicate_object_map.object_map.child[0] not in join_table: + with open(str(triples_map_element.data_source), + "r") as input_file_descriptor: + child_tree = ET.parse(input_file_descriptor) + child_root = child_tree.getroot() + hash_maker_xml(child_root, triples_map_element, + predicate_object_map.object_map, parent_map, namespace) if "@" in predicate_object_map.object_map.child[0]: child_condition = predicate_object_map.object_map.child[0].split("@")[len(predicate_object_map.object_map.child[0].split("@"))-1] @@ -2157,115 +2892,31 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): object_list = [] object = None else: - if predicate_object_map.object_map.parent != None: - if triples_map_element.triples_map_id + "_" + \ - predicate_object_map.object_map.child[0] not in join_table: - with open(str(triples_map_element.data_source), - "r") as input_file_descriptor: - child_tree = ET.parse(input_file_descriptor) - child_root = child_tree.getroot() - hash_maker_xml(child_root, triples_map_element, - predicate_object_map.object_map, 
parent_map, namespace) - - if "@" in predicate_object_map.object_map.child[0]: - child_condition = predicate_object_map.object_map.child[0].split("@")[len(predicate_object_map.object_map.child[0].split("@"))-1] - if child_condition in child.attrib: - if child.attrib[child_condition] != None: - if child.attrib[child_condition] in join_table[ - triples_map_element.triples_map_id + "_" + - predicate_object_map.object_map.child[0]]: - object_list = join_table[triples_map_element.triples_map_id + "_" + - predicate_object_map.object_map.child[0]][ - child.attrib[child_condition]] - else: - object_list = [] - else: - object_list = [] - else: - if child.find(predicate_object_map.object_map.child[0]) != None: - if child.find(predicate_object_map.object_map.child[0]).text in join_table[ - triples_map_element.triples_map_id + "_" + - predicate_object_map.object_map.child[0]]: - object_list = join_table[triples_map_element.triples_map_id + "_" + - predicate_object_map.object_map.child[0]][ - child.find(predicate_object_map.object_map.child[0]).text] - else: - object_list = [] + try: + object = "<" + string_substitution_xml( + triples_map_element.subject_map.value, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, namespace) + ">" + except TypeError: object = None - else: - try: - object = "<" + string_substitution_xml( - triples_map_element.subject_map.value, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, namespace) + ">" - except TypeError: - object = None - break - else: - continue - else: - object = None + break + else: + continue else: object = None + else: + object = None - if predicate in general_predicates: - dictionary_table_update(predicate + "_" + predicate_object_map.object_map.value) - else: - dictionary_table_update(predicate) - if predicate != None and (object != None or object) and subject != None: - for graph in triples_map.subject_map.graph: - dictionary_table_update(subject) - if isinstance(object, list): - for obj in object: - dictionary_table_update(obj) - triple = subject + " " + predicate + " " + obj + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, - "subject", - triples_map.iterator, - parent_map, - namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, - namespace) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[dic_table[ - predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: - 
output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - else: - dictionary_table_update(object) - triple = subject + " " + predicate + " " + object + ".\n" + if predicate in general_predicates: + dictionary_table_update(predicate + "_" + predicate_object_map.object_map.value) + else: + dictionary_table_update(predicate) + if predicate != None and (object != None or object) and subject != None: + for graph in triples_map.subject_map.graph: + dictionary_table_update(subject) + if isinstance(object, list): + for obj in object: + dictionary_table_update(obj) + triple = subject + " " + predicate + " " + obj + ".\n" if graph != None and "defaultGraph" not in graph: if "{" in graph: triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, @@ -2287,147 +2938,241 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): output_file_descriptor.write(triple) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) + dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]]: output_file_descriptor.write(triple) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: if dic_table[predicate] not in g_triples: output_file_descriptor.write(triple) g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate]]: output_file_descriptor.write(triple) g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: output_file_descriptor.write(triple) i += 1 - if predicate[1:-1] in predicate_object_map.graph: - if isinstance(object, list): - for obj in object: - triple = subject + " " + predicate + " " + obj + ".\n" - if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ - predicate_object_map.graph[predicate[1:-1]]: - if "{" in predicate_object_map.graph[predicate[1:-1]]: - triple = triple[:-2] + " <" + string_substitution_xml( - predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(predicate_object_map.graph[predicate[1:-1]], - "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, - namespace) + ">") - else: - triple = triple[:-2] + " <" + predicate_object_map.graph[ - predicate[1:-1]] + ">.\n" - dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - 
g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - predicate + "_" + predicate_object_map.object_map.value]: - output_file_descriptor.write(triple) - g_triples[dic_table[ - predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[predicate]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) + else: + dictionary_table_update(object) + triple = subject + " " + predicate + " " + object + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, + "subject", + triples_map.iterator, + parent_map, + namespace) + ">.\n" + dictionary_table_update( + "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, + namespace) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 else: - triple = subject + " " + predicate + " " + object + ".\n" + output_file_descriptor.write(triple) + i += 1 + if predicate[1:-1] in predicate_object_map.graph: + if isinstance(object, list): + for obj in object: + triple = subject + " " + predicate + " " + obj + ".\n" if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ - predicate_object_map.graph[predicate[1:-1]]: + predicate_object_map.graph[predicate[1:-1]]: if "{" in predicate_object_map.graph[predicate[1:-1]]: triple = triple[:-2] + " <" + string_substitution_xml( predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", triples_map.iterator, parent_map, namespace) + ">.\n" + dictionary_table_update( + "<" + string_substitution_xml(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, + namespace) + ">") else: - triple = triple[:-2] + 
" <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + triple = triple[:-2] + " <" + predicate_object_map.graph[ + predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") if duplicate == "yes": if predicate in general_predicates: if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: output_file_descriptor.write(triple) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) + dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ predicate + "_" + predicate_object_map.object_map.value]: output_file_descriptor.write(triple) g_triples[dic_table[ predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: if dic_table[predicate] not in g_triples: output_file_descriptor.write(triple) g_triples.update({dic_table[predicate]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) + dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate]]: output_file_descriptor.write(triple) g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) + {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: output_file_descriptor.write(triple) - elif predicate != None and subject != None and object_list: - dictionary_table_update(subject) - for obj in object_list: - dictionary_table_update(obj) - for graph in triples_map.subject_map.graph: - if predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" - else: - triple = subject + " " + predicate + " " + obj + ".\n" + else: + triple = subject + " " + predicate + " " + object + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution_xml( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, namespace) + ">.\n" + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + predicate + "_" + predicate_object_map.object_map.value]: + output_file_descriptor.write(triple) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[predicate]: { + 
dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + output_file_descriptor.write(triple) + elif predicate != None and subject != None and object_list: + dictionary_table_update(subject) + for obj in object_list: + dictionary_table_update(obj) + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, + "subject", + triples_map.iterator, + parent_map, + namespace) + ">.\n" + dictionary_table_update( + "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, + namespace) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + else: + output_file_descriptor.write(triple) + i += 1 + if predicate[1:-1] in predicate_object_map.graph: + triple = subject + " " + predicate + " " + obj + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution_xml( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", + triples_map.iterator, parent_map, namespace) + ">.\n" + dictionary_table_update( + "<" + string_substitution_xml(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", child, "subject", triples_map.iterator, + parent_map, namespace) + ">") else: - triple = subject + " " + predicate + " " + obj + ".\n" - - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_xml(graph, "{(.+?)}", child, - "subject", - triples_map.iterator, - parent_map, - namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(graph, "{(.+?)}", child, "subject", - 
triples_map.iterator, parent_map, - namespace) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") if duplicate == "yes": if predicate in general_predicates: if dic_table[ @@ -2440,8 +3185,8 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]]: output_file_descriptor.write(triple) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 else: @@ -2459,61 +3204,15 @@ def semantify_xml(triples_map, triples_map_list, output_file_descriptor): else: output_file_descriptor.write(triple) i += 1 - if predicate[1:-1] in predicate_object_map.graph: - triple = subject + " " + predicate + " " + obj + ".\n" - if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ - predicate_object_map.graph[predicate[1:-1]]: - if "{" in predicate_object_map.graph[predicate[1:-1]]: - triple = triple[:-2] + " <" + string_substitution_xml( - predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", child, "subject", - triples_map.iterator, parent_map, namespace) + ">.\n" - dictionary_table_update( - "<" + string_substitution_xml(predicate_object_map.graph[predicate[1:-1]], - "{(.+?)}", child, "subject", triples_map.iterator, - parent_map, namespace) + ">") - else: - triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" - dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[dic_table[ - predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - object_list = [] - else: - continue + object_list = [] + else: + continue return i def semantify_json(triples_map, triples_map_list, delimiter, output_file_descriptor, data, iterator): logger.info("TM: " + triples_map.triples_map_name) - + global current_logical_dump triples_map_triples = {} generated_triples = {} object_list = [] @@ -2521,6 +3220,8 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip global blank_message global host, port, user, 
password, datab i = 0 + if iterator == "$[*]": + iterator = "$.[*]" if iterator != "None" and iterator != "$.[*]" and iterator != "": new_iterator = "" temp_keys = iterator.split(".") @@ -2587,8 +3288,10 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip else: i += semantify_json(triples_map, triples_map_list, delimiter, output_file_descriptor, row, iterator.replace(new_iterator[:-1], "")) + elif iterator == "$.[*]": + for row in data: + i += semantify_json(triples_map, triples_map_list, delimiter, output_file_descriptor, row, "") else: - create_subject = True global generated_subjects @@ -2644,9 +3347,18 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -2655,9 +3367,18 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -2712,9 +3433,18 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip try: if " " not in subject_value: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: logger.error(" is an invalid URL") subject = None @@ -2730,9 +3460,18 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -2844,6 +3583,13 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip object = "\"" + predicate_object_map.object_map.value + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + 
object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data, + "object", ignore, iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.mapping_type == "template": try: object = string_substitution_json(predicate_object_map.object_map.value, "{(.+?)}", data, "object", @@ -2865,6 +3611,33 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip if "." in object_list[i]: object_list[i] = object_list[i].replace(".", "2E") object_list[i] = encode_char(object_list[i]) + else: + if predicate_object_map.object_map.datatype != None: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format( + predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data, + "object", ignore, iterator) + if "http" in datatype_value: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.language != None: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: + object[i] += "@es" + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: + object[i] += "@en" + elif len(predicate_object_map.object_map.language) == 2: + object[i] += "@" + predicate_object_map.object_map.language + else: + object[i] = None + elif predicate_object_map.object_map.language_map != None: + lang = string_substitution_json(predicate_object_map.object_map.language_map, ".+", + data, "object", ignore, iterator) + if lang != None: + object[i] += "@" + string_substitution_json( + predicate_object_map.object_map.language_map, ".+", data, "object", ignore, + iterator)[1:-1] i += 1 else: if predicate_object_map.object_map.term is None: @@ -2885,13 +3658,22 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data, + "object", ignore, iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object 
+= "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_json(predicate_object_map.object_map.language_map, ".+", data, "object", ignore, iterator) @@ -2921,13 +3703,22 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip if predicate_object_map.object_map.datatype != None: object_list[i] = "\"" + object_list[i][1:-1] + "\"" + "^^<{}>".format( predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data, + "object", ignore, iterator) + if "http" in datatype_value: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object[i] = "\"" + object[i][1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object_list[i] += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object_list[i] += "@en" elif len(predicate_object_map.object_map.language) == 2: object_list[i] += "@" + predicate_object_map.object_map.language + else: + object_list[i] = None elif predicate_object_map.object_map.language_map != None: object_list[i] += "@" + string_substitution_json( predicate_object_map.object_map.language_map, ".+", data, "object", ignore, @@ -2959,13 +3750,22 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_json(predicate_object_map.object_map.datatype_map, "{(.+?)}", data, + "object", ignore, iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_json(predicate_object_map.object_map.language_map, ".+", data, "object", ignore, iterator) @@ -2993,10 +3793,10 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip data = 
                                        if isinstance(data, list):
                                            hash_maker(data, triples_map_element,
-                                                      predicate_object_map.object_map)
+                                                      predicate_object_map.object_map,"", triples_map_list)
                                        elif len(data) < 2:
                                            hash_maker(data[list(data.keys())[0]], triples_map_element,
-                                                      predicate_object_map.object_map)
+                                                      predicate_object_map.object_map,"", triples_map_list)
                                    else:
                                        with open(str(triples_map_element.data_source), "r") as input_file_descriptor:
@@ -3004,16 +3804,17 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip
                                                data_element = csv.DictReader(input_file_descriptor, delimiter=delimiter)
                                                hash_maker(data_element, triples_map_element,
-                                                          predicate_object_map.object_map)
+                                                          predicate_object_map.object_map,"", triples_map_list)
                                            else:
                                                data_element = json.load(input_file_descriptor)
                                                if triples_map_element.iterator != "None" and triples_map_element.iterator != "$.[*]" and triples_map_element.iterator != "[*]":
                                                    join_iterator(data_element, triples_map_element.iterator,
                                                                  triples_map_element,
-                                                                 predicate_object_map.object_map)
+                                                                 predicate_object_map.object_map,
+                                                                 triples_map_list)
                                                else:
                                                    hash_maker(data_element[list(data_element.keys())[0]],
-                                                              triples_map_element, predicate_object_map.object_map)
+                                                              triples_map_element, predicate_object_map.object_map,"", triples_map_list)
                                elif triples_map_element.file_format == "XPath":
                                    with open(str(triples_map_element.data_source), "r") as input_file_descriptor:
@@ -3050,15 +3851,15 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip
                            with open(str(triples_map_element.data_source), "r") as input_file_descriptor:
                                if str(triples_map_element.file_format).lower() == "csv":
                                    data = csv.DictReader(input_file_descriptor, delimiter=delimiter)
-                                   hash_maker(data, triples_map_element, predicate_object_map.object_map)
+                                   hash_maker(data, triples_map_element, predicate_object_map.object_map,"", triples_map_list)
                                else:
                                    parent_data = json.load(input_file_descriptor)
                                    if triples_map_element.iterator != "None":
                                        join_iterator(parent_data, triples_map_element.iterator,
-                                                     triples_map_element, predicate_object_map.object_map)
+                                                     triples_map_element, predicate_object_map.object_map, triples_map_list)
                                    else:
                                        hash_maker(parent_data[list(parent_data.keys())[0]],
-                                                  triples_map_element, predicate_object_map.object_map)
+                                                  triples_map_element, predicate_object_map.object_map,"", triples_map_list)
                                if "." 
in predicate_object_map.object_map.child[0]: temp_keys = predicate_object_map.object_map.child[0].split(".") temp_data = data @@ -3208,159 +4009,63 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip parent_iterator) + ">" except TypeError: object = None - break - else: - continue - else: - object = None - else: - object = None - - if predicate in general_predicates: - dictionary_table_update(predicate + "_" + predicate_object_map.object_map.value) - else: - dictionary_table_update(predicate) - if predicate != None and object != None and subject != None: - dictionary_table_update(subject) - dictionary_table_update(object) - for graph in triples_map.subject_map.graph: - triple = subject + " " + predicate + " " + object + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, "subject", - ignore, iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, - iterator) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - if predicate[1:-1] in predicate_object_map.graph: - triple = subject + " " + predicate + " " + object + ".\n" - if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ - predicate_object_map.graph[predicate[1:-1]]: - if "{" in predicate_object_map.graph[predicate[1:-1]]: - triple = triple[:-2] + " <" + string_substitution_json( - predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", data, "subject", ignore, - iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution_json(predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", - data, "subject", ignore, iterator) + ">") - else: - triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" - dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in 
g_triples[ - predicate + "_" + predicate_object_map.object_map.value]: - output_file_descriptor.write(triple) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - elif predicate != None and subject != None and object_list: - dictionary_table_update(subject) - for obj in object_list: - dictionary_table_update(obj) - for graph in triples_map.subject_map.graph: - if predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" - else: - triple = subject + " " + predicate + " " + obj + ".\n" + break else: - triple = subject + " " + predicate + " " + obj + ".\n" + continue + else: + object = None + else: + object = None + if is_current_output_valid(triples_map.triples_map_id,predicate_object_map,current_logical_dump,logical_dump): + if predicate in general_predicates: + dictionary_table_update(predicate + "_" + predicate_object_map.object_map.value) + else: + dictionary_table_update(predicate) + if predicate != None and object != None and subject != None: + dictionary_table_update(subject) + dictionary_table_update(object) + for graph in triples_map.subject_map.graph: + triple = subject + " " + predicate + " " + object + ".\n" if graph != None and "defaultGraph" not in graph: if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, - "subject", ignore, - iterator) + ">.\n" + triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, "subject", + ignore, iterator) + ">.\n" dictionary_table_update( "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, iterator) + ">") else: triple = triple[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 + if predicate_object_map.graph[predicate[1:-1]] == None or graph != None: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + 
predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - + output_file_descriptor.write(triple) + i += 1 if predicate[1:-1] in predicate_object_map.graph: - triple = subject + " " + predicate + " " + obj + ".\n" + triple = subject + " " + predicate + " " + object + ".\n" if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ predicate_object_map.graph[predicate[1:-1]]: if "{" in predicate_object_map.graph[predicate[1:-1]]: @@ -3368,180 +4073,279 @@ def semantify_json(triples_map, triples_map_list, delimiter, output_file_descrip predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", data, "subject", ignore, iterator) + ">.\n" dictionary_table_update( - "<" + string_substitution_json(predicate_object_map.graph[predicate[1:-1]], - "{(.+?)}", data, "subject", ignore, iterator) + ">") + "<" + string_substitution_json(predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", + data, "subject", ignore, iterator) + ">") else: triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") if duplicate == "yes": if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: output_file_descriptor.write(triple) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) + dic_table[subject] + "_" + dic_table[object]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + predicate + "_" + predicate_object_map.object_map.value]: output_file_descriptor.write(triple) g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) + {dic_table[subject] + "_" + dic_table[object]: ""}) i += 1 else: if dic_table[predicate] not in g_triples: output_file_descriptor.write(triple) g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: + 
elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: output_file_descriptor.write(triple) g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) + {dic_table[subject] + "_" + dic_table[object]: ""}) i += 1 else: output_file_descriptor.write(triple) i += 1 - object_list = [] - elif predicate != None and subject_list: - for subj in subject_list: - dictionary_table_update(subj) - type_predicate = "" - for rdf_class in triples_map.subject_map.rdf_class: - if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): - for graph in triples_map.subject_map.graph: - obj = "<{}>".format(rdf_class) - dictionary_table_update(obj) - dictionary_table_update(type_predicate + "_" + obj) - rdf_type = subj + " " + type_predicate + " " + obj + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - rdf_type = rdf_type[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, - "subject", ignore, - iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, - iterator) + ">") - else: - rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if dic_table[type_predicate + "_" + obj] not in g_triples: - output_file_descriptor.write(rdf_type) - g_triples.update( - {dic_table[type_predicate + "_" + obj]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ - dic_table[type_predicate + "_" + obj]]: - output_file_descriptor.write(rdf_type) - g_triples[dic_table[type_predicate + "_" + obj]].update( - {dic_table[subj] + "_" + dic_table[obj]: ""}) - i += 1 - else: - output_file_descriptor.write(rdf_type) - i += 1 - if object != None: - dictionary_table_update(object) - triple = subj + " " + predicate + " " + object + ".\n" + elif predicate != None and subject != None and object_list: + dictionary_table_update(subject) + for obj in object_list: + dictionary_table_update(obj) for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" if graph != None and "defaultGraph" not in graph: if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, "subject", - ignore, iterator) + ">.\n" + triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, + "subject", ignore, + iterator) + ">.\n" dictionary_table_update( "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, iterator) + ">") else: triple = triple[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subj] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate + "_" + 
predicate_object_map.object_map.value]].update( - {dic_table[subj] + "_" + dic_table[object]: ""}) - i += 1 - else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subj] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: - output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subj] + "_" + dic_table[object]: ""}) - i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - elif object_list: - for obj in object_list: - dictionary_table_update(obj) - for graph in triples_map.subject_map.graph: - if predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - triple = subj + " " + predicate + " <" + obj[1:-1] + ">.\n" + if predicate_object_map.graph[predicate[1:-1]] == None or graph != None: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 else: - triple = subj + " " + predicate + " " + obj + ".\n" + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 else: - triple = subj + " " + predicate + " " + obj + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, - "subject", ignore, - iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, - iterator) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") + output_file_descriptor.write(triple) + i += 1 + + if predicate[1:-1] in predicate_object_map.graph: + triple = subject + " " + predicate + " " + obj + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution_json( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", data, "subject", ignore, + iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution_json(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", data, "subject", ignore, iterator) + ">") + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") if duplicate == "yes": if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] 
not in g_triples: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: output_file_descriptor.write(triple) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subj] + "_" + dic_table[obj]: ""}}) + dic_table[subject] + "_" + dic_table[obj]: ""}}) i += 1 - elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]]: output_file_descriptor.write(triple) g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + else: + output_file_descriptor.write(triple) + i += 1 + object_list = [] + elif predicate != None and subject_list: + for subj in subject_list: + dictionary_table_update(subj) + type_predicate = "" + for rdf_class in triples_map.subject_map.rdf_class: + if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): + for graph in triples_map.subject_map.graph: + obj = "<{}>".format(rdf_class) + dictionary_table_update(obj) + dictionary_table_update(type_predicate + "_" + obj) + rdf_type = subj + " " + type_predicate + " " + obj + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + rdf_type = rdf_type[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, + "subject", ignore, + iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, + iterator) + ">") + else: + rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if duplicate == "yes": + if dic_table[type_predicate + "_" + obj] not in g_triples: + output_file_descriptor.write(rdf_type) + g_triples.update( + {dic_table[type_predicate + "_" + obj]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[type_predicate + "_" + obj]]: + output_file_descriptor.write(rdf_type) + g_triples[dic_table[type_predicate + "_" + obj]].update( {dic_table[subj] + "_" + dic_table[obj]: ""}) i += 1 else: - if dic_table[predicate] not in g_triples: - output_file_descriptor.write(triple) - g_triples.update( - {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(rdf_type) + i += 1 + if object != None: + dictionary_table_update(object) + triple = subj + " " + predicate + " " + object + ".\n" + for graph in triples_map.subject_map.graph: + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, "subject", + ignore, iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, + iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + 
graph + ">") + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + else: + output_file_descriptor.write(triple) + i += 1 + elif object_list: + for obj in object_list: + dictionary_table_update(obj) + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subj + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subj + " " + predicate + " " + obj + ".\n" + else: + triple = subj + " " + predicate + " " + obj + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution_json(graph, "{(.+?)}", data, + "subject", ignore, + iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution_json(graph, "{(.+?)}", data, "subject", ignore, + iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + output_file_descriptor.write(triple) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + output_file_descriptor.write(triple) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[dic_table[predicate]]: + output_file_descriptor.write(triple) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + else: output_file_descriptor.write(triple) - g_triples[dic_table[predicate]].update( - {dic_table[subj] + "_" + dic_table[obj]: ""}) i += 1 - else: - output_file_descriptor.write(triple) - i += 1 - else: - continue - else: - continue + else: + 
continue + else: + continue return i -def semantify_file(triples_map, triples_map_list, delimiter, output_file_descriptor, data): +def semantify_file(triples_map, triples_map_list, delimiter, output_file_descriptor, data, no_inner_cycle): """ (Private function, not accessible from outside this package) @@ -3569,6 +4373,7 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip """ object_list = [] + subject_list = [] triples_string = "" end_turtle = "" i = 0 @@ -3645,9 +4450,18 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -3656,9 +4470,18 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip # if row[field] == condition: try: if "http" not in subject_value: - subject = subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -3711,27 +4534,98 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip subject_value = subject_value[1:-1] if triples_map.subject_map.condition == "": if " " not in subject_value: - if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if "BlankNode" in triples_map.subject_map.term_type: + subject = "_:" + subject_value else: - subject = "<" + subject_value + ">" + if "http" not in subject_value: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" + else: + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: subject = None else: - # field, condition = condition_separetor(triples_map.subject_map.condition) - # if row[field] == condition: - try: - if "http" not in subject_value: - subject = "<" + base + subject_value + ">" - else: - subject = "<" + subject_value + ">" - except: - subject = None + subject = None elif "constant" in triples_map.subject_map.subject_mapping_type: subject = "<" + triples_map.subject_map.value + ">" + elif "function" in triples_map.subject_map.subject_mapping_type: + subject = None + if new_formulation == "no": + temp_dics = [] + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == triples_map.subject_map.value: + dic = create_dictionary(triples_map_element) + current_func = {"output_name":"OUTPUT", + "inputs":dic["inputs"], + "function":dic["executes"], + "func_par":dic} + for inputs in dic["inputs"]: + 
temp_dic = {} + if "reference function" in inputs: + temp_dic = {"inputs":dic["inputs"], + "function":dic["executes"], + "func_par":dic, + "id":triples_map_element.triples_map_id} + if inner_function_exists(temp_dic, temp_dics): + temp_dics.append(temp_dic) + if temp_dics: + func = inner_function(row,current_func,triples_map_list) + subject = "<" + encode_char(func) + ">" + else: + func = execute_function(row,current_func) + subject = "<" + encode_char(func) + ">" + else: + func = None + for func_map in triples_map.func_map_list: + if func_map.func_map_id == triples_map.subject_map.value: + current_func = {"inputs":func_map.parameters, + "function":func_map.name} + inner_func = False + for param in func_map.parameters: + if "function" in func_map.parameters[param]["type"]: + inner_func = True + if inner_func: + func = new_inner_function(row,triples_map.subject_map.value,triples_map) + else: + func = execute_function(row,None,current_func) + if triples_map.subject_map.func_result != None and func != None: + func = func[triples_map.subject_map.func_result] + if func != None: + if "http://" in func or "https://" in func: + subject = "<" + func + ">" + else: + subject = "<" + encode_char(func) + ">" + else: + subject = None + elif "quoted triples map" in triples_map.subject_map.subject_mapping_type: + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == triples_map.subject_map.value: + if triples_map_element.data_source != triples_map.data_source: + if triples_map.subject_map.parent != None: + if ("quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child) not in join_table: + if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": + with open(str(triples_map_element.data_source), "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + data = csv.DictReader(input_file_descriptor, delimiter=',') + hash_maker(data, triples_map_element, triples_map.subject_map, "quoted", triples_map_list) + else: + pass + if row[triples_map.subject_map.child] in join_table["quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child]: + subject_list = join_table["quoted_" + triples_map_element.triples_map_id + "_" + triples_map.subject_map.child][row[triples_map.subject_map.child]] + else: + subject_list = inner_semantify_file(triples_map_element, triples_map_list, delimiter, row, base) + subject = None else: if triples_map.subject_map.condition == "": @@ -3760,7 +4654,7 @@ if triples_map.subject_map.rdf_class != [None] and subject != None: predicate = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>" for rdf_class in triples_map.subject_map.rdf_class: - if rdf_class != None and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): + if rdf_class != None and rdf_class != "None" and ("str" == type(rdf_class).__name__ or "URIRef" == type(rdf_class).__name__): obj = "<{}>".format(rdf_class) rdf_type = subject + " " + predicate + " " + obj + ".\n" for graph in triples_map.subject_map.graph: @@ -3775,35 +4669,36 @@ else: rdf_type = rdf_type[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - dictionary_table_update(subject) - dictionary_table_update(obj) - dictionary_table_update(predicate + "_" + obj) - if dic_table[predicate + "_" + obj] not in g_triples: - if
output_format.lower() == "n-triples": - output_file_descriptor.write(rdf_type) + if no_inner_cycle: + if duplicate == "yes": + dictionary_table_update(subject) + dictionary_table_update(obj) + dictionary_table_update(predicate + "_" + obj) + if dic_table[predicate + "_" + obj] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(rdf_type) + else: + output_file_descriptor.write(subject + " a " + determine_prefix(obj)) + g_triples.update( + {dic_table[predicate + "_" + obj]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + obj]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(rdf_type) + else: + output_file_descriptor.write(subject + " a " + determine_prefix(obj)) + g_triples[dic_table[predicate + "_" + obj]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 else: - output_file_descriptor.write(subject + " a " + determine_prefix(obj)) - g_triples.update( - {dic_table[predicate + "_" + obj]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + obj]]: + duplicate_type = True + else: if output_format.lower() == "n-triples": output_file_descriptor.write(rdf_type) else: output_file_descriptor.write(subject + " a " + determine_prefix(obj)) - g_triples[dic_table[predicate + "_" + obj]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 - else: - duplicate_type = True - else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(rdf_type) - else: - output_file_descriptor.write(subject + " a " + determine_prefix(obj)) - i += 1 if output_format.lower() == "turtle" and len(triples_map.predicate_object_maps_list) == 0: output_file_descriptor.write(".\n") @@ -3844,6 +4739,29 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip predicate = string_substitution(predicate_object_map.predicate_map.value, ".+", row, "predicate", ignore, triples_map.iterator) predicate = "<" + predicate[1:-1] + ">" + elif predicate_object_map.predicate_map.mapping_type == "function": + if new_formulation == "yes": + func = None + for func_map in triples_map.func_map_list: + if func_map.func_map_id == predicate_object_map.predicate_map.value: + current_func = {"inputs":func_map.parameters, + "function":func_map.name} + inner_func = False + for param in func_map.parameters: + if "function" in func_map.parameters[param]["type"]: + inner_func = True + if inner_func: + func = new_inner_function(row,predicate_object_map.predicate_map.value,triples_map) + else: + func = execute_function(row,None,current_func) + if predicate_object_map.predicate_map.func_result != None and func != None: + func = func[predicate_object_map.predicate_map.func_result] + if None != func: + predicate = "<" + func + ">" + else: + predicate = None + else: + predicate = None else: predicate = None @@ -3858,6 +4776,14 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: object = "\"" + object[1:-1] + "\"" + "^^{}".format( determine_prefix(predicate_object_map.object_map.datatype)) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + "object", ignore, triples_map.iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + 
"^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) + elif predicate_object_map.object_map.mapping_type == "template": try: if predicate_object_map.object_map.term is None: @@ -3883,13 +4809,22 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format( predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + "object", ignore, triples_map.iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution(predicate_object_map.object_map.language_map, ".+", row, "object", ignore, triples_map.iterator) @@ -3917,13 +4852,22 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: object = "\"" + object[1:-1] + "\"" + "^^{}".format( determine_prefix(predicate_object_map.object_map.datatype)) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + "object", ignore, triples_map.iterator) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution(predicate_object_map.object_map.language_map, ".+", row, "object", ignore, triples_map.iterator) @@ -3937,6 +4881,11 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip object = "<" + encode_char(object[1:-1]) + ">" else: object = None + elif "BlankNode" in predicate_object_map.object_map.term: + if " " not in object: + 
object = "_:" + object[1:-1] + else: + object = None elif predicate_object_map.object_map.mapping_type == "parent triples map": if subject != None: for triples_map_element in triples_map_list: @@ -3954,22 +4903,22 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if triples_map_element.iterator != "None" and triples_map_element.iterator != "$.[*]" and triples_map_element.iterator != "[*]" and triples_map_element.iterator != "[*]": join_iterator(data, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: with open(str(triples_map_element.data_source), "r") as input_file_descriptor: @@ -3980,30 +4929,30 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip reader = reader.drop_duplicates(keep='first') data = reader.to_dict(orient='records') hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: data = json.load(input_file_descriptor) if triples_map_element.iterator: if triples_map_element.iterator != "None" and triples_map_element.iterator != "$.[*]" and triples_map_element.iterator != "[*]" and triples_map_element.iterator != "[*]": join_iterator(data, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif triples_map_element.file_format == "XPath": with open(str(triples_map_element.data_source), @@ -4046,23 +4995,24 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if triples_map_element.iterator != "None" and triples_map_element.iterator != "$.[*]" and triples_map_element.iterator != "[*]": join_iterator(data, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, + triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + 
predicate_object_map.object_map,"", triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: with open(str(triples_map_element.data_source), "r") as input_file_descriptor: @@ -4084,23 +5034,24 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip join_iterator(data, triples_map_element.iterator, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map, + triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) else: if isinstance(data, list): hash_maker(data, triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif len(data) < 2: hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) if child_list_value(predicate_object_map.object_map.child, row) in \ join_table[triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]]: @@ -4177,44 +5128,35 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip object = None else: if predicate_object_map.object_map.parent != None: - if predicate_object_map.object_map.parent[0] != \ - predicate_object_map.object_map.child[0]: - if (triples_map_element.triples_map_id + "_" + child_list( - predicate_object_map.object_map.child)) not in join_table: - with open(str(triples_map_element.data_source), - "r") as input_file_descriptor: - if str(triples_map_element.file_format).lower() == "csv": - parent_data = csv.DictReader(input_file_descriptor, - delimiter=delimiter) + if (triples_map_element.triples_map_id + "_" + child_list( + predicate_object_map.object_map.child)) not in join_table: + with open(str(triples_map_element.data_source), + "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + parent_data = csv.DictReader(input_file_descriptor, + delimiter=delimiter) + hash_maker_list(parent_data, triples_map_element, + predicate_object_map.object_map) + else: + parent_data = json.load(input_file_descriptor) + if isinstance(parent_data, list): hash_maker_list(parent_data, triples_map_element, predicate_object_map.object_map) else: - parent_data = json.load(input_file_descriptor) - if isinstance(parent_data, list): - hash_maker_list(parent_data, triples_map_element, - predicate_object_map.object_map) - else: - hash_maker_list(parent_data[list(parent_data.keys())[0]], - triples_map_element, - predicate_object_map.object_map) - if sublist(predicate_object_map.object_map.child, row.keys()): - if child_list_value(predicate_object_map.object_map.child, row) in \ - join_table[triples_map_element.triples_map_id + "_" + child_list( - predicate_object_map.object_map.child)]: - object_list = join_table[ - triples_map_element.triples_map_id + "_" + child_list( - predicate_object_map.object_map.child)][ - child_list_value(predicate_object_map.object_map.child, row)] - else: - object_list 
= [] - object = None - else: - try: - object = "<" + string_substitution(triples_map_element.subject_map.value, - "{(.+?)}", row, "object", ignore, - triples_map.iterator) + ">" - except TypeError: - object = None + hash_maker_list(parent_data[list(parent_data.keys())[0]], + triples_map_element, + predicate_object_map.object_map) + if sublist(predicate_object_map.object_map.child, row.keys()): + if child_list_value(predicate_object_map.object_map.child, row) in \ + join_table[triples_map_element.triples_map_id + "_" + child_list( + predicate_object_map.object_map.child)]: + object_list = join_table[ + triples_map_element.triples_map_id + "_" + child_list( + predicate_object_map.object_map.child)][ + child_list_value(predicate_object_map.object_map.child, row)] + else: + object_list = [] + object = None else: try: object = "<" + string_substitution(triples_map_element.subject_map.value, @@ -4227,6 +5169,94 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip continue else: object = None + elif predicate_object_map.object_map.mapping_type == "reference function": + object = None + if new_formulation == "no": + temp_dics = [] + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == predicate_object_map.object_map.value: + dic = create_dictionary(triples_map_element) + current_func = {"inputs":dic["inputs"], + "function":dic["executes"], + "func_par":dic} + for inputs in dic["inputs"]: + temp_dic = {} + if "reference function" in inputs: + temp_dic = {"inputs":dic["inputs"], + "function":dic["executes"], + "func_par":dic, + "id":triples_map_element.triples_map_id} + if inner_function_exists(temp_dic, temp_dics): + temp_dics.append(temp_dic) + if temp_dics: + func = inner_function(row,current_func,triples_map_list) + if predicate_object_map.object_map.term is not None: + if "IRI" in predicate_object_map.object_map.term: + object = "<" + encode_char(func) + ">" + else: + if "" != func: + object = "\"" + func + "\"" + else: + object = None + else: + if predicate_object_map.object_map.term is not None: + func = execute_function(row,None,current_func) + if "IRI" in predicate_object_map.object_map.term: + object = "<" + encode_char(func) + ">" + else: + func = execute_function(row,None,current_func) + if "" != func: + object = "\"" + func + "\"" + else: + object = None + else: + func = None + for func_map in triples_map.func_map_list: + if func_map.func_map_id == predicate_object_map.object_map.value: + current_func = {"inputs":func_map.parameters, + "function":func_map.name} + inner_func = False + for param in func_map.parameters: + if "function" in func_map.parameters[param]["type"]: + inner_func = True + if inner_func: + func = new_inner_function(row,predicate_object_map.object_map.value,triples_map) + else: + func = execute_function(row,None,current_func) + if predicate_object_map.object_map.func_result != None and func != None: + func = func[predicate_object_map.object_map.func_result] + if predicate_object_map.object_map.term is not None: + if func != None: + if "IRI" in predicate_object_map.object_map.term: + if "http://" in func.lower() or "https://" in func.lower(): + object = "<" + func + ">" + else: + object = "<" + encode_char(func) + ">" + else: + object = None + else: + if None != func: + object = "\"" + func + "\"" + else: + object = None + elif "quoted triples map" in predicate_object_map.object_map.mapping_type: + for triples_map_element in triples_map_list: + if triples_map_element.triples_map_id == 
predicate_object_map.object_map.value: + if triples_map_element.data_source != triples_map.data_source: + if predicate_object_map.object_map.parent != None: + if ("quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]) not in join_table: + if str(triples_map_element.file_format).lower() == "csv" or triples_map_element.file_format == "JSONPath": + with open(str(triples_map_element.data_source), "r") as input_file_descriptor: + if str(triples_map_element.file_format).lower() == "csv": + data = csv.DictReader(input_file_descriptor, delimiter=',') + hash_maker(data, triples_map_element, predicate_object_map.object_map, "quoted", triples_map_list) + else: + pass + if row[predicate_object_map.object_map.child[0]] in join_table["quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]]: + object_list = join_table["quoted_" + triples_map_element.triples_map_id + "_" + predicate_object_map.object_map.child[0]][row[predicate_object_map.object_map.child[0]]] + else: + object_list = inner_semantify_file(triples_map_element, triples_map_list, delimiter, row, base) + object = None else: object = None @@ -4348,66 +5378,67 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: triple = triple[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - - if duplicate == "yes": - dictionary_table_update(subject) - dictionary_table_update(object) - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - generated += 1 - else: - if dic_table[predicate] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) + if no_inner_cycle: + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + dictionary_table_update(subject) + dictionary_table_update(object) + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[object] not in 
g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, predicate_object_map, triples_map, output_file_descriptor, generated) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) i += 1 generated += 1 - else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - i += 1 - generated += 1 if predicate[1:-1] in predicate_object_map.graph: triple = subject + " " + predicate + " " + object + ".\n" if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ @@ -4422,116 +5453,32 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - predicate + "_" + 
predicate_object_map.object_map.value]: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - generated += 1 - else: - if dic_table[predicate] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate]]: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - generated += 1 - else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - i += 1 - generated += 1 - elif predicate != None and subject != None and object_list: - for obj in object_list: - if obj != None: - for graph in triples_map.subject_map.graph: - if predicate_object_map.object_map.term != None: - if "IRI" in predicate_object_map.object_map.term: - triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" - else: - triple = subject + " " + predicate + " " + obj + ".\n" - else: - triple = subject + " " + predicate + " " + obj + ".\n" - if graph != None and "defaultGraph" not in graph: - if "{" in graph: - triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject", - ignore, - triples_map.iterator) + ">.\n" - dictionary_table_update( - "<" + string_substitution(graph, "{(.+?)}", row, "subject", ignore, - triples_map.iterator) + ">") - else: - triple = triple[:-2] + " <" + graph + ">.\n" - dictionary_table_update("<" + graph + ">") + if no_inner_cycle: if duplicate == "yes": - dictionary_table_update(subject) - dictionary_table_update(obj) if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, predicate_object_map, triples_map, output_file_descriptor, generated) g_triples.update({dic_table[ predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) + dic_table[subject] + "_" + dic_table[object]: ""}}) i += 1 generated += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + elif dic_table[subject] + "_" + dic_table[object] 
not in g_triples[ + predicate + "_" + predicate_object_map.object_map.value]: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, predicate_object_map, triples_map, output_file_descriptor, generated) g_triples[ dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) + {dic_table[subject] + "_" + dic_table[object]: ""}) i += 1 generated += 1 else: @@ -4539,44 +5486,309 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, predicate_object_map, triples_map, output_file_descriptor, generated) g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) i += 1 generated += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ dic_table[predicate]]: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, triples_map, - output_file_descriptor, generated) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) + end_turtle = turtle_print(subject, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + i += 1 + generated += 1 + elif predicate != None and subject != None and object_list: + for obj in object_list: + if obj != None: + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + else: + if "quoted triples map" in predicate_object_map.object_map.mapping_type: + triple = subject + " " + predicate + " <<" + obj + ">>.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject", + ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(graph, "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if no_inner_cycle: + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + dictionary_table_update(subject) + dictionary_table_update(obj) + if predicate in general_predicates: + 
if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) + i += 1 + generated += 1 + if predicate[1:-1] in predicate_object_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + else: + triple = subject + " " + predicate + " " + obj + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") + else: + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") + if no_inner_cycle: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = 
turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate]].update( + {dic_table[subject] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subject, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, + output_file_descriptor, generated) + i += 1 + generated += 1 + object_list = [] + elif predicate != None and subject_list and object != None: + dictionary_table_update(object) + for subj in subject_list: + if subj != None: + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = "<<" + subj + ">> " + predicate + " <" + object[1:-1] + ">.\n" + else: + triple = "<<" + subj + ">> " + predicate + " " + object + ".\n" + else: + triple = "<<" + subj + ">> " + predicate + " " + object + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject", + ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(graph, "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if no_inner_cycle: + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + dictionary_table_update(subj) + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, 
generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, + generated) i += 1 generated += 1 - - else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, obj, object_list, duplicate_type, - predicate_object_map, triples_map, output_file_descriptor, - generated) - i += 1 - generated += 1 - if predicate[1:-1] in predicate_object_map.graph: if predicate_object_map.object_map.term != None: if "IRI" in predicate_object_map.object_map.term: - triple = subject + " " + predicate + " <" + obj[1:-1] + ">.\n" + triple = subj + " " + predicate + " <" + object[1:-1] + ">.\n" else: - triple = subject + " " + predicate + " " + obj + ".\n" + triple = subj + " " + predicate + " " + object + ".\n" else: - triple = subject + " " + predicate + " " + obj + ".\n" + triple = subj + " " + predicate + " " + object + ".\n" if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ predicate_object_map.graph[predicate[1:-1]]: if "{" in predicate_object_map.graph[predicate[1:-1]]: @@ -4590,72 +5802,254 @@ def semantify_file(triples_map, triples_map_list, delimiter, output_file_descrip else: triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[ - predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, - triples_map, output_file_descriptor, - generated) - 
g_triples.update({dic_table[ - predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) - else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, - triples_map, output_file_descriptor, - generated) - g_triples[dic_table[ - predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) - i += 1 - generated += 1 + if no_inner_cycle: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[object]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, object, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[object]: ""}) + i += 1 + generated += 1 + else: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) else: - if dic_table[predicate] not in g_triples: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) + end_turtle = turtle_print(subj, predicate, object, object_list, duplicate_type, + predicate_object_map, triples_map, + output_file_descriptor, generated) + i += 1 + generated += 1 + subject_list = [] + elif predicate != None and subject_list and object_list: + for subj in subject_list: + for obj in object_list: + if obj != None and subj != None: + for graph in triples_map.subject_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = "<<" + subj + ">> " + predicate + " <" + obj[1:-1] + ">.\n" + else: + 
triple = "<<" + subj + ">> " + predicate + " " + obj + ".\n" + else: + if "quoted triples map" in predicate_object_map.object_map.mapping_type: + triple = "<<" + subj + ">> " + predicate + " <<" + obj + ">>.\n" + else: + triple = "<<" + subj + ">> " + predicate + " " + obj + ".\n" + if graph != None and "defaultGraph" not in graph: + if "{" in graph: + triple = triple[:-2] + " <" + string_substitution(graph, "{(.+?)}", row, "subject", + ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(graph, "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") + else: + triple = triple[:-2] + " <" + graph + ">.\n" + dictionary_table_update("<" + graph + ">") + if no_inner_cycle: + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + dictionary_table_update(subj) + dictionary_table_update(obj) + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, - triples_map, output_file_descriptor, - generated) - g_triples.update( - {dic_table[predicate]: {dic_table[subject] + "_" + dic_table[obj]: ""}}) - i += 1 - generated += 1 - elif dic_table[subject] + "_" + dic_table[obj] not in g_triples[ - dic_table[predicate]]: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, triples_map, + output_file_descriptor, generated) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + + else: if output_format.lower() == "n-triples": output_file_descriptor.write(triple) else: - end_turtle = turtle_print(subject, predicate, obj, object_list, - duplicate_type, predicate_object_map, - triples_map, output_file_descriptor, + end_turtle = turtle_print(subj, predicate, obj, object_list, 
duplicate_type, + predicate_object_map, triples_map, output_file_descriptor, generated) - g_triples[dic_table[predicate]].update( - {dic_table[subject] + "_" + dic_table[obj]: ""}) i += 1 generated += 1 + if predicate[1:-1] in predicate_object_map.graph: + if predicate_object_map.object_map.term != None: + if "IRI" in predicate_object_map.object_map.term: + triple = subj + " " + predicate + " <" + obj[1:-1] + ">.\n" + else: + triple = subj + " " + predicate + " " + obj + ".\n" else: - if output_format.lower() == "n-triples": - output_file_descriptor.write(triple) + triple = subj + " " + predicate + " " + obj + ".\n" + if predicate_object_map.graph[predicate[1:-1]] != None and "defaultGraph" not in \ + predicate_object_map.graph[predicate[1:-1]]: + if "{" in predicate_object_map.graph[predicate[1:-1]]: + triple = triple[:-2] + " <" + string_substitution( + predicate_object_map.graph[predicate[1:-1]], "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">.\n" + dictionary_table_update( + "<" + string_substitution(predicate_object_map.graph[predicate[1:-1]], + "{(.+?)}", row, "subject", ignore, + triples_map.iterator) + ">") else: - end_turtle = turtle_print(subject, predicate, obj, object_list, duplicate_type, - predicate_object_map, triples_map, - output_file_descriptor, generated) - i += 1 - generated += 1 + triple = triple[:-2] + " <" + predicate_object_map.graph[predicate[1:-1]] + ">.\n" + dictionary_table_update("<" + predicate_object_map.graph[predicate[1:-1]] + ">") + if no_inner_cycle: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[ + predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update({dic_table[ + predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[ + predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if dic_table[predicate] not in g_triples: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples.update( + {dic_table[predicate]: {dic_table[subj] + "_" + dic_table[obj]: ""}}) + i += 1 + generated += 1 + elif dic_table[subj] + "_" + dic_table[obj] not in g_triples[ + dic_table[predicate]]: + if output_format.lower() == "n-triples": + output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, + duplicate_type, predicate_object_map, + triples_map, output_file_descriptor, + generated) + g_triples[dic_table[predicate]].update( + {dic_table[subj] + "_" + dic_table[obj]: ""}) + i += 1 + generated += 1 + else: + if output_format.lower() == "n-triples": 
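+                                                        # duplicate removal is off on this path, so no g_triples
+                                                        # bookkeeping is done: each generated triple is written
+                                                        # straight to the output descriptor.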
+ output_file_descriptor.write(triple) + else: + end_turtle = turtle_print(subj, predicate, obj, object_list, duplicate_type, + predicate_object_map, triples_map, + output_file_descriptor, generated) + i += 1 + generated += 1 object_list = [] + subject_list = [] else: continue return i @@ -4742,9 +6136,18 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -4753,9 +6156,18 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -4811,9 +6223,18 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file subject_value = subject_value[1:-1] if " " not in subject_value: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: logger.error(" is an invalid URL") subject = None @@ -4830,9 +6251,18 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -4930,6 +6360,13 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file object = "\"" + predicate_object_map.object_map.value + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + if predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_array(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in 
datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.mapping_type == "template": try: if predicate_object_map.object_map.term is None: @@ -4954,13 +6391,22 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file row_headers, "object", ignore) + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_array(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_array(predicate_object_map.object_map.language_map, ".+", row, row_headers, "object", ignore) @@ -4983,13 +6429,22 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file object = object.replace("\n", "\\n") if predicate_object_map.object_map.datatype != None: object += "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_array(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_array(predicate_object_map.object_map.language_map, ".+", row, row_headers, "object", ignore) @@ -5028,7 +6483,7 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file with 
open(str(triples_map_element.data_source), "r") as input_file_descriptor: if str(triples_map_element.file_format).lower() == "csv": data = csv.DictReader(input_file_descriptor, delimiter=",") - hash_maker(data, triples_map_element, predicate_object_map.object_map) + hash_maker(data, triples_map_element, predicate_object_map.object_map,"", triples_map_list) else: data = json.load(input_file_descriptor) if isinstance(data, list): @@ -5234,45 +6689,46 @@ def semantify_mysql(row, row_headers, triples_map, triples_map_list, output_file else: triple = triple[:-2] + " <" + graph + ">.\n" dictionary_table_update("<" + graph + ">") - if duplicate == "yes": - if predicate in general_predicates: - if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: - try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) - g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { - dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ - dic_table[predicate + "_" + predicate_object_map.object_map.value]]: - try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) - g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( - {dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 + if predicate[1:-1] not in predicate_object_map.graph or graph != None or triples_map.subject_map.graph == [None]: + if duplicate == "yes": + if predicate in general_predicates: + if dic_table[predicate + "_" + predicate_object_map.object_map.value] not in g_triples: + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) + g_triples.update({dic_table[predicate + "_" + predicate_object_map.object_map.value]: { + dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[ + dic_table[predicate + "_" + predicate_object_map.object_map.value]]: + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) + g_triples[dic_table[predicate + "_" + predicate_object_map.object_map.value]].update( + {dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 + else: + if dic_table[predicate] not in g_triples: + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) + g_triples.update({dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) + i += 1 + elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) + g_triples[dic_table[predicate]].update({dic_table[subject] + "_" + dic_table[object]: ""}) + i += 1 else: - if dic_table[predicate] not in g_triples: - try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) - g_triples.update({dic_table[predicate]: {dic_table[subject] + "_" + dic_table[object]: ""}}) - i += 1 - elif dic_table[subject] + "_" + dic_table[object] not in g_triples[dic_table[predicate]]: - try: - output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) - g_triples[dic_table[predicate]].update({dic_table[subject] + "_" + dic_table[object]: ""}) - i += 1 - else: - try: - 
output_file_descriptor.write(triple) - except: - output_file_descriptor.write(triple.encode("utf-8")) + try: + output_file_descriptor.write(triple) + except: + output_file_descriptor.write(triple.encode("utf-8")) i += 1 if predicate[1:-1] in predicate_object_map.graph: triple = subject + " " + predicate + " " + object + ".\n" @@ -5533,9 +6989,18 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -5544,9 +7009,18 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -5605,9 +7079,18 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f try: if " " not in subject_value: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + subject_value + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: logger.error(" is an invalid URL") subject = None @@ -5628,9 +7111,18 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f # if row[field] == condition: try: if "http" not in subject_value: - subject = "<" + base + subject_value + ">" + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" else: - subject = "<" + encode_char(subject_value) + ">" + if is_valid_url_syntax(subject_value): + subject = "<" + subject_value + ">" + else: + if base != "": + subject = "<" + base + subject_value + ">" + else: + subject = "<" + "http://example.com/base/" + encode_char(subject_value) + ">" except: subject = None @@ -5742,6 +7234,13 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f object = "\"" + predicate_object_map.object_map.value + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_postgres(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) 
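+                            # rml:datatypeMap: the datatype is itself a template, resolved
+                            # against the current row; a resolved value that is not already
+                            # an absolute IRI is namespaced under http://example.com/base/
+                            # before being attached to the literal as ^^<datatype>.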
+ if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.mapping_type == "template": try: if predicate_object_map.object_map.term is None: @@ -5766,13 +7265,22 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f row_headers, "object", ignore) + "\"" if predicate_object_map.object_map.datatype != None: object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_postgres(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_postgres(predicate_object_map.object_map.language_map, ".+", row, row_headers, "object", ignore) @@ -5795,13 +7303,22 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f object = object.replace("\n", "\\n") if predicate_object_map.object_map.datatype != None: object += "^^<{}>".format(predicate_object_map.object_map.datatype) + elif predicate_object_map.object_map.datatype_map != None: + datatype_value = string_substitution_postgres(predicate_object_map.object_map.datatype_map, "{(.+?)}", row, + row_headers, "object", ignore) + if "http" in datatype_value: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format(datatype_value) + else: + object = "\"" + object[1:-1] + "\"" + "^^<{}>".format("http://example.com/base/" + datatype_value) elif predicate_object_map.object_map.language != None: - if "spanish" in predicate_object_map.object_map.language or "es" in predicate_object_map.object_map.language: + if "spanish" == predicate_object_map.object_map.language or "es" == predicate_object_map.object_map.language: object += "@es" - elif "english" in predicate_object_map.object_map.language or "en" in predicate_object_map.object_map.language: + elif "english" == predicate_object_map.object_map.language or "en" == predicate_object_map.object_map.language: object += "@en" elif len(predicate_object_map.object_map.language) == 2: object += "@" + predicate_object_map.object_map.language + else: + object = None elif predicate_object_map.object_map.language_map != None: lang = string_substitution_postgres(predicate_object_map.object_map.language_map, ".+", row, row_headers, "object", ignore) @@ -5836,11 +7353,11 @@ def semantify_postgres(row, row_headers, triples_map, triples_map_list, output_f with 
open(str(triples_map_element.data_source), "r") as input_file_descriptor: if str(triples_map_element.file_format).lower() == "csv": data = csv.DictReader(input_file_descriptor, delimiter=",") - hash_maker(data, triples_map_element, predicate_object_map.object_map) + hash_maker(data, triples_map_element, predicate_object_map.object_map,"", triples_map_list) else: data = json.load(input_file_descriptor) hash_maker(data[list(data.keys())[0]], triples_map_element, - predicate_object_map.object_map) + predicate_object_map.object_map,"", triples_map_list) elif triples_map_element.file_format == "XPath": with open(str(triples_map_element.data_source), "r") as input_file_descriptor: @@ -6209,6 +7726,8 @@ def semantify(config_path, log_path='error.log'): global blank_message global generated_subjects global user, password, port, host, datab + global current_logical_dump + global g_triples start = time.time() if config["datasets"]["all_in_one_file"] == "no": @@ -6242,15 +7761,9 @@ def semantify(config_path, log_path='error.log'): for source_type in order_list: if source_type == "csv": for source in order_list[source_type]: - if enrichment == "yes": - if ".csv" in source: - reader = pd.read_csv(source, dtype=str, encoding="latin-1") - else: - reader = pd.read_csv(source, dtype=str, sep='\t', encoding="latin-1") - reader = reader.where(pd.notnull(reader), None) - if duplicate == "yes": - reader = reader.drop_duplicates(keep='first') - data = reader.to_dict(orient='records') + if ".nt" in source: + g = rdflib.Graph() + g.parse(source, format="nt") for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6259,13 +7772,91 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: + results = g.query(sorted_sources[source_type][source][triples_map].iterator) + data = [] + for row in results: + result_dict = {} + keys = list(row.__dict__["labels"].keys()) + i = 0 + while i < len(row): + result_dict[str(keys[i])] = str(row[keys[i]]) + i += 1 + data.append(result_dict) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with 
open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_file, sorted_sources[source_type][ source][triples_map], triples_map_list, ",", output_file_descriptor, - data).result() + data, True).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -6274,7 +7865,7 @@ def semantify(config_path, log_path='error.log'): generated_subjects = release_subjects( sorted_sources[source_type][source][triples_map], generated_subjects) - else: + elif "endpoint:" in source: for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6283,26 +7874,304 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: - with open(source, "r", encoding="latin-1") as input_file_descriptor: - if ".csv" in source: - data = csv.DictReader(input_file_descriptor, delimiter=',') - else: - data = csv.DictReader(input_file_descriptor, delimiter='\t') - blank_message = True - number_triple += executor.submit(semantify_file, - sorted_sources[source_type][ - source][triples_map], - triples_map_list, ",", - output_file_descriptor, - data).result() - if duplicate == "yes": - predicate_list = release_PTT( - sorted_sources[source_type][source][triples_map], - predicate_list) - if mapping_partitions == "yes": - generated_subjects = release_subjects( - sorted_sources[source_type][source][triples_map], - generated_subjects) + sparql = SPARQLWrapper(source.replace("endpoint:","")) + sparql.setQuery(sorted_sources[source_type][source][triples_map].iterator) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + data = [] + for result in results["results"]["bindings"]: + result_dict = {} + for key, value in result.items(): + result_dict[key] = value["value"] + data.append(result_dict) + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in 
logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + if enrichment == "yes": + if ".csv" in source: + if source in delimiter: + reader = pd.read_csv(source, dtype=str, sep=delimiter[source], encoding="latin-1") + else: + reader = pd.read_csv(source, dtype=str, encoding="latin-1") + else: + reader = pd.read_csv(source, dtype=str, sep='\t', encoding="latin-1") + reader = reader.where(pd.notnull(reader), None) + if duplicate == "yes": + reader = 
reader.drop_duplicates(keep='first') + data = reader.to_dict(orient='records') + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if 
duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + with open(source, "r", encoding="latin-1") as input_file_descriptor: + if ".csv" in source: + if source in delimiter: + data = csv.DictReader(input_file_descriptor, delimiter=delimiter[source]) + else: + data = csv.DictReader(input_file_descriptor, delimiter=',') + else: + data = csv.DictReader(input_file_descriptor, delimiter='\t') + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + 
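+                                                # the uncompressed dump was renamed above; it is streamed
+                                                # back through gzip so the dump path requested in the
+                                                # mapping ends up holding the compressed file.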
f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) elif source_type == "JSONPath": for source in order_list[source_type]: for triples_map in sorted_sources[source_type][source]: @@ -6315,12 +8184,105 @@ def semantify(config_path, log_path='error.log'): triples_map].subject_map.rdf_class != [None]: if "http" in sorted_sources[source_type][source][ triples_map].data_source: - response = urlopen( - sorted_sources[source_type][source][triples_map].data_source) - data = json.loads(response.read()) + file_source = sorted_sources[source_type][source][triples_map].data_source + if "#" in file_source: + file = file_source.split("#")[1] + else: + file = file_source.split("/")[len(file_source.split("/"))-1] + if "gz" in file_source or "zip" in file_source or "tar.xz" in file_source or "tar.gz" in file_source: + response = requests.get(file_source) + with open(file, "wb") as f: + f.write(response.content) + if "zip" in file_source: + with zipfile.ZipFile(file, 'r') as zip_ref: + zip_ref.extractall() + data = json.load(open(file.replace(".zip",""))) + elif "tar.xz" in file_source or "tar.gz" in file_source: + with tarfile.open(file, "r") as tar: + tar.extractall() + if "tar.xz" in file_source: + data = json.load(open(file.replace(".tar.xz",""))) + else: + data = json.load(open(file.replace(".tar.gz",""))) + elif "gz" in file_source: + with open(file, "rb") as gz_file: + with open(file.replace(".gz",""), "wb") as txt_file: + shutil.copyfileobj(gzip.GzipFile(fileobj=gz_file), txt_file) + data = json.load(open(file.replace(".gz",""))) + else: + response = urlopen(file_source) + data = json.loads(response.read()) else: data = json.load(open(source)) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_json, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + ",", logical_output_descriptor, data, + sorted_sources[source_type][source][ + triples_map].iterator).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = 
rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_json, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6346,6 +8308,72 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_xml, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + logical_output_descriptor).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, 
"w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_xml, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6362,16 +8390,9 @@ def semantify(config_path, log_path='error.log'): for source_type in sorted_sources: if source_type == "csv": for source in sorted_sources[source_type]: - if enrichment == "yes": - if ".csv" in source: - reader = pd.read_csv(source, dtype=str, encoding="latin-1") # latin-1 - else: - reader = pd.read_csv(source, dtype=str, sep="\t", header=0, - encoding="latin-1") - reader = reader.where(pd.notnull(reader), None) - if duplicate == "yes": - reader = reader.drop_duplicates(keep='first') - data = reader.to_dict(orient='records') + if ".nt" in source: + g = rdflib.Graph() + g.parse(source, format="nt") for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6380,13 +8401,91 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: + results = g.query(sorted_sources[source_type][source][triples_map].iterator) + data = [] + for row in results: + result_dict = {} + keys = list(row.__dict__["labels"].keys()) + i = 0 + while i < len(row): + result_dict[str(keys[i])] = str(row[keys[i]]) + i += 1 + data.append(result_dict) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, 
format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_file, sorted_sources[source_type][ source][triples_map], triples_map_list, ",", output_file_descriptor, - data).result() + data, True).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -6395,7 +8494,7 @@ def semantify(config_path, log_path='error.log'): generated_subjects = release_subjects( sorted_sources[source_type][source][triples_map], generated_subjects) - else: + elif "endpoint:" in source: for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6404,26 +8503,305 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: - blank_message = True - with open(source, "r", encoding="latin-1") as input_file_descriptor: - if ".csv" in source: - data = csv.DictReader(input_file_descriptor, delimiter=',') - else: - data = csv.DictReader(input_file_descriptor, delimiter='\t') - number_triple += executor.submit(semantify_file, - sorted_sources[source_type][ - source][triples_map], - triples_map_list, ",", - output_file_descriptor, - data).result() - if duplicate == "yes": - predicate_list = release_PTT( - sorted_sources[source_type][source][triples_map], - predicate_list) - if mapping_partitions == "yes": - generated_subjects = release_subjects( - sorted_sources[source_type][source][triples_map], - generated_subjects) + sparql = SPARQLWrapper(source.replace("endpoint:","")) + 
sparql.setQuery(sorted_sources[source_type][source][triples_map].iterator) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + data = [] + for result in results["results"]["bindings"]: + result_dict = {} + for key, value in result.items(): + result_dict[key] = value["value"] + data.append(result_dict) + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + 
sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + if enrichment == "yes": + if ".csv" in source: + if source in delimiter: + reader = pd.read_csv(source, dtype=str, sep=delimiter[source], encoding="latin-1") + else: + reader = pd.read_csv(source, dtype=str, encoding="latin-1") # latin-1 + else: + reader = pd.read_csv(source, dtype=str, sep="\t", header=0, + encoding="latin-1") + reader = reader.where(pd.notnull(reader), None) + if duplicate == "yes": + reader = reader.drop_duplicates(keep='first') + data = reader.to_dict(orient='records') + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + for triples_map in sorted_sources[source_type][source]: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in 
dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + for triples_map in sorted_sources[source_type][source]: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + with open(source, "r", encoding="latin-1") as input_file_descriptor: + if ".csv" in source: + if source in delimiter: + data = csv.DictReader(input_file_descriptor, delimiter=delimiter[source]) + else: + data = csv.DictReader(input_file_descriptor, delimiter=',') + else: + data = csv.DictReader(input_file_descriptor, delimiter='\t') + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + 
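Both CSV paths above repeat one pattern: pandas when enrichment is enabled (NULLs become None and duplicates can be dropped), a streaming csv.DictReader otherwise, with the separator looked up in the per-source delimiter table. A condensed sketch, where delimiter_map plays the role of the global delimiter dict:

```python
import csv
import pandas as pd

def load_csv(source, delimiter_map, enrichment="yes", duplicate="yes"):
    sep = delimiter_map.get(source, ",")
    if enrichment == "yes":
        reader = pd.read_csv(source, dtype=str, sep=sep, encoding="latin-1")
        reader = reader.where(pd.notnull(reader), None)   # NaN -> None
        if duplicate == "yes":
            reader = reader.drop_duplicates(keep="first")
        return reader.to_dict(orient="records")           # reusable list of dicts
    # streaming path: the caller must keep the file open while iterating,
    # which is why the code above wraps the reader in a with-block
    input_file = open(source, "r", encoding="latin-1")
    return csv.DictReader(input_file, delimiter=sep)
```

One caveat worth keeping in mind: the DictReader path yields a one-shot iterator, so once a logical dump pass has consumed it, a following pass over the same `data` sees no rows unless the reader is re-created; the pandas path returns a plain list and can be walked repeatedly.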
tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) elif source_type == "JSONPath": for source in sorted_sources[source_type]: for triples_map in sorted_sources[source_type][source]: @@ -6436,13 +8814,106 @@ def semantify(config_path, log_path='error.log'): triples_map].subject_map.rdf_class != [None]: if "http" in sorted_sources[source_type][source][ triples_map].data_source: - response = urlopen( - sorted_sources[source_type][source][triples_map].data_source) - data = json.loads(response.read()) + file_source = sorted_sources[source_type][source][triples_map].data_source + if "#" in file_source: + file = file_source.split("#")[1] + else: + file = file_source.split("/")[len(file_source.split("/"))-1] + if "gz" in file_source or "zip" in file_source or "tar.xz" in file_source or "tar.gz" in file_source: + response = requests.get(file_source) + with open(file, "wb") as f: + f.write(response.content) + if "zip" in file_source: + with zipfile.ZipFile(file, 'r') as zip_ref: + zip_ref.extractall() + data = json.load(open(file.replace(".zip",""))) + elif "tar.xz" in file_source or "tar.gz" in file_source: + with tarfile.open(file, "r") as tar: + tar.extractall() + if "tar.xz" in file_source: + data = json.load(open(file.replace(".tar.xz",""))) + else: + data = json.load(open(file.replace(".tar.gz",""))) + elif "gz" in file_source: + with open(file, "rb") as gz_file: + with open(file.replace(".gz",""), "wb") as txt_file: + shutil.copyfileobj(gzip.GzipFile(fileobj=gz_file), txt_file) + data = json.load(open(file.replace(".gz",""))) + else: + response = urlopen(file_source) + data = json.loads(response.read()) else: data = json.load(open( sorted_sources[source_type][source][triples_map].data_source)) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += 
executor.submit(semantify_json, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + ",", logical_output_descriptor, data, + sorted_sources[source_type][source][ + triples_map].iterator).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_json, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6468,6 +8939,72 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_xml, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + logical_output_descriptor).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = 
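The per-format chain repeated after every logical dump parses the freshly written N-Triples file and rewrites it in the serialization implied by the file name. A sketch of that step collapsed into one helper, assuming rdflib 6+ (where serialize returns a string) and leaving out the "rdfjson" branch, which uses the project's own generate_rdfjson converter:

```python
import rdflib

# file-name marker -> rdflib serializer name, as in the branches above
FORMATS = {"jsonld": "json-ld", "n3": "n3", "rdfxml": "xml", "ttl": "ttl"}

def reserialize_dump(dump_output, context=None):
    g = rdflib.Graph()
    g.parse(dump_output, format="nt")      # the dump is written as N-Triples
    for marker, fmt in FORMATS.items():
        if marker in dump_output:
            if fmt == "json-ld":
                # the mapping's prefixes serve as the JSON-LD context
                data = g.serialize(format=fmt, context=context)
            else:
                data = g.serialize(format=fmt)
            with open(dump_output, "w") as f:
                f.write(data)              # overwrite the dump in place
            return
```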
g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_xml, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6524,8 +9061,87 @@ def semantify(config_path, log_path='error.log'): else: predicate = None if data == []: - for row in cursor: - if config[dataset_i]["db"].lower() != "none": + if config[dataset_i]["db"].lower() != "none": + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + current_logical_dump = "" + cursor.execute(source) + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, 
format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + for row in cursor: number_triple += executor.submit(semantify_mysql, row, row_headers, sorted_sources[ @@ -6541,7 +9157,87 @@ def semantify(config_path, log_path='error.log'): "password"], config[dataset_i]["db"], predicate).result() - else: + data.append(row) + else: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() + current_logical_dump = "" + cursor.execute(source) + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + 
json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + for row in cursor: number_triple += executor.submit(semantify_mysql, row, row_headers, sorted_sources[ @@ -6556,10 +9252,88 @@ def semantify(config_path, log_path='error.log'): config[dataset_i][ "password"], "None", predicate).result() - data.append(row) + data.append(row) else: - for row in data: - if config[dataset_i]["db"].lower() != "none": + if config[dataset_i]["db"].lower() != "none": + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + with open(dump_output, "w") as logical_output_descriptor: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + for row in data: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with 
open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + for row in data: number_triple += executor.submit(semantify_mysql, row, row_headers, sorted_sources[ @@ -6575,7 +9349,85 @@ def semantify(config_path, log_path='error.log'): "password"], config[dataset_i]["db"], predicate).result() - else: + else: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + for row in data: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + 
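In the database branches a cursor can only be walked once, which is why cursor.execute(source) reappears right after each logical dump: the source query is re-run so the main pass can iterate the rows again. Schematically, with emit_row standing in for the semantify_mysql / semantify_postgres calls:

```python
def dump_then_stream(cursor, source, dump_path, main_descriptor, emit_row):
    count = 0
    # first pass: fill the per-TriplesMap logical dump
    with open(dump_path, "w") as logical_output_descriptor:
        for row in cursor:
            count += emit_row(row, logical_output_descriptor)
    # a DB cursor is exhausted after one walk, so re-run the query
    # before the main output pass re-reads the same rows
    cursor.execute(source)
    for row in cursor:
        count += emit_row(row, main_descriptor)
    return count
```

The data.append(row) calls in the paths above serve the same purpose when the result set is small enough to cache in memory.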
dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + for row in data: number_triple += executor.submit(semantify_mysql, row, row_headers, sorted_sources[ @@ -6633,6 +9485,81 @@ def semantify(config_path, log_path='error.log'): else: predicate = None if data == []: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + number_triple += executor.submit(semantify_postgres, row, + row_headers, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["user"], + config[dataset_i]["password"], + config[dataset_i]["db"], + config[dataset_i]["host"], + predicate).result() + cursor.execute(source) + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + 
tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in cursor: number_triple += executor.submit(semantify_postgres, row, row_headers, @@ -6647,6 +9574,80 @@ def semantify(config_path, log_path='error.log'): predicate).result() data.append(row) else: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + for row in data: + number_triple += executor.submit(semantify_postgres, row, + row_headers, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["user"], + config[dataset_i]["password"], + config[dataset_i]["db"], + config[dataset_i]["host"], + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + 
dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in data: number_triple += executor.submit(semantify_postgres, row, row_headers, @@ -6700,12 +9701,9 @@ def semantify(config_path, log_path='error.log'): for source_type in order_list: if source_type == "csv": for source in order_list[source_type]: - if enrichment == "yes": - reader = pd.read_csv(source, encoding="latin-1") - reader = reader.where(pd.notnull(reader), None) - if duplicate == "yes": - reader = reader.drop_duplicates(keep='first') - data = reader.to_dict(orient='records') + if ".nt" in source: + g = rdflib.Graph() + g.parse(source, format="nt") for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6714,13 +9712,91 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: + results = g.query(sorted_sources[source_type][source][triples_map].iterator) + data = [] + for row in results: + result_dict = {} + keys = list(row.__dict__["labels"].keys()) + i = 0 + while i < len(row): + result_dict[str(keys[i])] = str(row[keys[i]]) + i += 1 + data.append(result_dict) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + 
" " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_file, sorted_sources[source_type][ source][triples_map], triples_map_list, ",", output_file_descriptor, - data).result() + data, True).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -6729,7 +9805,7 @@ def semantify(config_path, log_path='error.log'): generated_subjects = release_subjects( sorted_sources[source_type][source][triples_map], generated_subjects) - else: + elif "endpoint:" in source: for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6738,23 +9814,292 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: - blank_message = True - with open(source, "r", encoding="latin-1") as input_file_descriptor: - data = csv.DictReader(input_file_descriptor, delimiter=',') - number_triple += executor.submit(semantify_file, - sorted_sources[source_type][ - source][triples_map], - triples_map_list, ",", - output_file_descriptor, - data).result() - if duplicate == "yes": - predicate_list = release_PTT( - sorted_sources[source_type][source][triples_map], - predicate_list) - if mapping_partitions == "yes": - generated_subjects = release_subjects( - sorted_sources[source_type][source][triples_map], - generated_subjects) + sparql = SPARQLWrapper(source.replace("endpoint:","")) + sparql.setQuery(sorted_sources[source_type][source][triples_map].iterator) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + data = [] + for result in results["results"]["bindings"]: + result_dict = {} + for key, value in result.items(): + result_dict[key] = value["value"] + data.append(result_dict) + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with 
open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + if enrichment == "yes": + reader = pd.read_csv(source, encoding="latin-1") + reader = reader.where(pd.notnull(reader), None) + if duplicate == "yes": + reader = reader.drop_duplicates(keep='first') + data = reader.to_dict(orient='records') + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += 
executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + with open(source, "r", encoding="latin-1") as input_file_descriptor: + data = csv.DictReader(input_file_descriptor, delimiter=',') + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in 
logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) elif source_type == "JSONPath": for source in order_list[source_type]: for triples_map in sorted_sources[source_type][source]: @@ -6767,13 +10112,106 @@ def semantify(config_path, log_path='error.log'): triples_map].subject_map.rdf_class != [None]: if "http" in sorted_sources[source_type][source][ triples_map].data_source: - response = urlopen( - 
sorted_sources[source_type][source][triples_map].data_source)
-                                data = json.loads(response.read())
+                                file_source = sorted_sources[source_type][source][triples_map].data_source
+                                if "#" in file_source:
+                                    file = file_source.split("#")[1]
+                                else:
+                                    file = file_source.split("/")[len(file_source.split("/"))-1]
+                                if "gz" in file_source or "zip" in file_source or "tar.xz" in file_source or "tar.gz" in file_source:
+                                    response = requests.get(file_source)
+                                    with open(file, "wb") as f:
+                                        f.write(response.content)
+                                    if "zip" in file_source:
+                                        with zipfile.ZipFile(file, 'r') as zip_ref:
+                                            zip_ref.extractall()
+                                        data = json.load(open(file.replace(".zip","")))
+                                    elif "tar.xz" in file_source or "tar.gz" in file_source:
+                                        with tarfile.open(file, "r") as tar:
+                                            tar.extractall()
+                                        if "tar.xz" in file_source:
+                                            data = json.load(open(file.replace(".tar.xz","")))
+                                        else:
+                                            data = json.load(open(file.replace(".tar.gz","")))
+                                    elif "gz" in file_source:
+                                        with open(file, "rb") as gz_file:
+                                            with open(file.replace(".gz",""), "wb") as txt_file:
+                                                shutil.copyfileobj(gzip.GzipFile(fileobj=gz_file), txt_file)
+                                        data = json.load(open(file.replace(".gz","")))
+                                    else:
+                                        response = urlopen(file_source)
+                                        data = json.loads(response.read())
                                 else:
                                     data = json.load(open(
                                         sorted_sources[source_type][source][triples_map].data_source))
                                 blank_message = True
+                                if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump:
+                                    for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]:
+                                        repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id])
+                                        if repeat_output == "":
+                                            temp_generated = g_triples
+                                            g_triples = {}
+                                            with open(dump_output, "w") as logical_output_descriptor:
+                                                current_logical_dump = dump_output
+                                                number_triple += executor.submit(semantify_json,
+                                                                                 sorted_sources[source_type][source][
+                                                                                     triples_map], triples_map_list,
+                                                                                 ",", logical_output_descriptor, data,
+                                                                                 sorted_sources[source_type][source][
+                                                                                     triples_map].iterator).result()
+                                            current_logical_dump = ""
+                                            g_triples = temp_generated
+                                            temp_generated = {}
+                                            if "jsonld" in dump_output:
+                                                context = extract_prefixes_from_ttl(config[dataset_i]["mapping"])
+                                                g = rdflib.Graph()
+                                                g.parse(dump_output, format="nt")
+                                                jsonld_data = g.serialize(format="json-ld", context=context)
+                                                with open(dump_output, "w") as f:
+                                                    f.write(jsonld_data)
+                                            elif "n3" in dump_output:
+                                                g = rdflib.Graph()
+                                                g.parse(dump_output, format="nt")
+                                                n3_data = g.serialize(format="n3")
+                                                with open(dump_output, "w") as f:
+                                                    f.write(n3_data)
+                                            elif "rdfjson" in dump_output:
+                                                g = rdflib.Graph()
+                                                g.parse(dump_output, format="nt")
+                                                json_data = generate_rdfjson(g)
+                                                with open(dump_output, "w") as f:
+                                                    json.dump(json_data,f)
+                                            elif "rdfxml" in dump_output:
+                                                g = rdflib.Graph()
+                                                g.parse(dump_output, format="nt")
+                                                xml_data = g.serialize(format="xml")
+                                                with open(dump_output, "w") as f:
+                                                    f.write(xml_data)
+                                            elif "ttl" in dump_output:
+                                                g = rdflib.Graph()
+                                                g.parse(dump_output, format="nt")
+                                                ttl_data = g.serialize(format="ttl")
+                                                with open(dump_output, "w") as f:
+                                                    f.write(ttl_data)
+                                            elif "tar.gz" in dump_output:
+                                                os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz",""))
+                                                with tarfile.open(dump_output, "w:gz") as tar:
+                                                    tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz",""))
+                                            elif "tar.xz" in dump_output:
+                                                os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz",""))
+                                                with tarfile.open(dump_output, "w:xz") as tar:
+
tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_json, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6799,6 +10237,72 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_xml, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + logical_output_descriptor).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", 
zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_xml, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6815,12 +10319,9 @@ def semantify(config_path, log_path='error.log'): for source_type in sorted_sources: if source_type == "csv": for source in sorted_sources[source_type]: - if enrichment == "yes": - reader = pd.read_csv(source, encoding="latin-1") - reader = reader.where(pd.notnull(reader), None) - if duplicate == "yes": - reader = reader.drop_duplicates(keep='first') - data = reader.to_dict(orient='records') + if ".nt" in source: + g = rdflib.Graph() + g.parse(source, format="nt") for triples_map in sorted_sources[source_type][source]: if (len(sorted_sources[source_type][source][ triples_map].predicate_object_maps_list) > 0 and @@ -6829,13 +10330,192 @@ def semantify(config_path, log_path='error.log'): 0].predicate_map.value != "None") or \ sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: + results = g.query(sorted_sources[source_type][source][triples_map].iterator) + data = [] + for row in results: + result_dict = {} + keys = list(row.__dict__["labels"].keys()) + i = 0 + while i < len(row): + result_dict[str(keys[i])] = str(row[keys[i]]) + i += 1 + data.append(result_dict) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + 
dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_file, sorted_sources[source_type][ source][triples_map], triples_map_list, ",", output_file_descriptor, - data).result() + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + elif "endpoint:" in source: + for triples_map in sorted_sources[source_type][source]: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + sparql = SPARQLWrapper(source.replace("endpoint:","")) + sparql.setQuery(sorted_sources[source_type][source][triples_map].iterator) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + data = [] + for result in results["results"]["bindings"]: + result_dict = {} + for key, value in result.items(): + result_dict[key] = value["value"] + data.append(result_dict) + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif 
"ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -6845,31 +10525,199 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][triples_map], generated_subjects) else: - with open(source, "r", encoding="latin-1") as input_file_descriptor: - data = csv.DictReader(input_file_descriptor, delimiter=',') + if enrichment == "yes": + reader = pd.read_csv(source, encoding="latin-1") + reader = reader.where(pd.notnull(reader), None) + if duplicate == "yes": + reader = reader.drop_duplicates(keep='first') + data = reader.to_dict(orient='records') for triples_map in sorted_sources[source_type][source]: - if (len(sorted_sources[source_type][source][ - triples_map].predicate_object_maps_list) > 0 and - sorted_sources[source_type][source][ - triples_map].predicate_object_maps_list[ - 0].predicate_map.value != "None") or \ + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and sorted_sources[source_type][source][ - triples_map].subject_map.rdf_class != [None]: - blank_message = True - number_triple += executor.submit(semantify_file, - sorted_sources[source_type][ - source][triples_map], - triples_map_list, ",", - output_file_descriptor, - data).result() - if duplicate == "yes": - predicate_list = release_PTT( - sorted_sources[source_type][source][triples_map], - predicate_list) - if mapping_partitions == "yes": - generated_subjects = release_subjects( - sorted_sources[source_type][source][triples_map], - generated_subjects) + triples_map].predicate_object_maps_list[ + 0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = 
is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) + else: + with open(source, "r", encoding="latin-1") as input_file_descriptor: + data = csv.DictReader(input_file_descriptor, delimiter=',') + for triples_map in sorted_sources[source_type][source]: + if "NonAssertedTriplesMap" not in sorted_sources[source_type][source][triples_map].mappings_type: + if (len(sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list) > 0 and + sorted_sources[source_type][source][ + triples_map].predicate_object_maps_list[ + 
0].predicate_map.value != "None") or \ + sorted_sources[source_type][source][ + triples_map].subject_map.rdf_class != [None]: + blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + logical_output_descriptor, + data, True).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + number_triple += executor.submit(semantify_file, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, ",", + output_file_descriptor, + data, True).result() + if duplicate == "yes": + predicate_list = release_PTT( + sorted_sources[source_type][source][triples_map], + predicate_list) + if mapping_partitions == "yes": + generated_subjects = release_subjects( + sorted_sources[source_type][source][triples_map], + generated_subjects) elif source_type == "JSONPath": for source in sorted_sources[source_type]: for triples_map in sorted_sources[source_type][source]: @@ -6882,13 
+10730,106 @@ def semantify(config_path, log_path='error.log'): triples_map].subject_map.rdf_class != [None]: if "http" in sorted_sources[source_type][source][ triples_map].data_source: - response = urlopen( - sorted_sources[source_type][source][triples_map].data_source) - data = json.loads(response.read()) + file_source = sorted_sources[source_type][source][triples_map].data_source + if "#" in file_source: + file = file_source.split("#")[1] + else: + file = file_source.split("/")[len(file_source.split("/"))-1] + if "gz" in file_source or "zip" in file_source or "tar.xz" in file_source or "tar.gz" in file_source: + response = requests.get(file_source) + with open(file, "wb") as f: + f.write(response.content) + if "zip" in file_source: + with zipfile.ZipFile(file, 'r') as zip_ref: + zip_ref.extractall() + data = json.load(open(file.replace(".zip",""))) + elif "tar.xz" in file_source or "tar.gz" in file_source: + with tarfile.open(file, "r") as tar: + tar.extractall() + if "tar.xz" in file_source: + data = json.load(open(file.replace(".tar.xz",""))) + else: + data = json.load(open(file.replace(".tar.gz",""))) + elif "gz" in file_source: + with open(file, "rb") as gz_file: + with open(file.replace(".gz",""), "wb") as txt_file: + shutil.copyfileobj(gzip.GzipFile(fileobj=gz_file), txt_file) + data = json.load(open(file.replace(".gz",""))) + else: + response = urlopen(file_source) + data = json.loads(response.read()) else: data = json.load(open( sorted_sources[source_type][source][triples_map].data_source)) blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_json, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + ",", logical_output_descriptor, data, + sorted_sources[source_type][source][ + triples_map].iterator).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), 
arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_json, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6914,6 +10855,72 @@ def semantify(config_path, log_path='error.log'): sorted_sources[source_type][source][ triples_map].subject_map.rdf_class != [None]: blank_message = True + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + number_triple += executor.submit(semantify_xml, + sorted_sources[source_type][source][ + triples_map], triples_map_list, + logical_output_descriptor).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 
'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) number_triple += executor.submit(semantify_xml, sorted_sources[source_type][source][ triples_map], triples_map_list, @@ -6971,6 +10978,101 @@ def semantify(config_path, log_path='error.log'): else: predicate = None if data == []: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + current_logical_dump = dump_output + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + if config[dataset_i]["db"].lower() != "none": + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + else: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() + current_logical_dump = "" + cursor.execute(source) + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + 
dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in cursor: if config[dataset_i]["db"].lower() != "none": number_triple += executor.submit(semantify_mysql, row, @@ -7005,38 +11107,133 @@ def semantify(config_path, log_path='error.log'): predicate).result() data.append(row) else: - for row in data: - if config[dataset_i]["db"].lower() != "none": - number_triple += executor.submit(semantify_mysql, row, - row_headers, - sorted_sources[ - source_type][source][ - triples_map], - triples_map_list, - output_file_descriptor, - config[dataset_i]["host"], - int(config[dataset_i][ - "port"]), - config[dataset_i]["user"], - config[dataset_i][ - "password"], - config[dataset_i]["db"], - predicate).result() - else: - number_triple += executor.submit(semantify_mysql, row, - row_headers, - sorted_sources[ - source_type][source][ - triples_map], - triples_map_list, - output_file_descriptor, - config[dataset_i]["host"], - int(config[dataset_i][ - "port"]), - config[dataset_i]["user"], - config[dataset_i][ - "password"], "None", - predicate).result() + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + with open(dump_output, "w") as logical_output_descriptor: + current_logical_dump = dump_output + for row in data: + if config[dataset_i]["db"].lower() != "none": + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + else: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, 
format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) + else: + for row in data: + if config[dataset_i]["db"].lower() != "none": + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + output_file_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], + config[dataset_i]["db"], + predicate).result() + else: + number_triple += executor.submit(semantify_mysql, row, + row_headers, + sorted_sources[ + source_type][source][ + triples_map], + triples_map_list, + output_file_descriptor, + config[dataset_i]["host"], + int(config[dataset_i][ + "port"]), + config[dataset_i]["user"], + config[dataset_i][ + "password"], "None", + predicate).result() if duplicate == "yes": predicate_list = release_PTT( sorted_sources[source_type][source][triples_map], @@ -7081,6 +11278,81 @@ def semantify(config_path, log_path='error.log'): else: predicate = None if data == []: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in cursor: + number_triple += executor.submit(semantify_postgres, row, + row_headers, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["user"], + config[dataset_i]["password"], + config[dataset_i]["db"], + config[dataset_i]["host"], + predicate).result() + current_logical_dump = "" + cursor.execute(source) + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in 
dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in cursor: number_triple += executor.submit(semantify_postgres, row, row_headers, @@ -7095,6 +11367,80 @@ def semantify(config_path, log_path='error.log'): predicate).result() data.append(row) else: + if sorted_sources[source_type][source][triples_map].triples_map_id in logical_dump: + for dump_output in logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]: + repeat_output = is_repeat_output(dump_output,logical_dump[sorted_sources[source_type][source][triples_map].triples_map_id]) + if repeat_output == "": + temp_generated = g_triples + g_triples = {} + current_logical_dump = dump_output + with open(dump_output, "w") as logical_output_descriptor: + for row in data: + number_triple += executor.submit(semantify_postgres, row, + row_headers, + sorted_sources[source_type][ + source][triples_map], + triples_map_list, + logical_output_descriptor, + config[dataset_i]["user"], + config[dataset_i]["password"], + config[dataset_i]["db"], + config[dataset_i]["host"], + predicate).result() + current_logical_dump = "" + g_triples = temp_generated + temp_generated = {} + if "jsonld" in dump_output: + context = extract_prefixes_from_ttl(config[dataset_i]["mapping"]) + g = rdflib.Graph() + g.parse(dump_output, format="nt") + jsonld_data = g.serialize(format="json-ld", context=context) + with open(dump_output, "w") as f: + f.write(jsonld_data) + elif "n3" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + n3_data = g.serialize(format="n3") + with open(dump_output, "w") as f: + f.write(n3_data) + elif "rdfjson" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + json_data = generate_rdfjson(g) + with open(dump_output, "w") as f: + json.dump(json_data,f) + elif "rdfxml" in dump_output: + g = 
rdflib.Graph() + g.parse(dump_output, format="nt") + xml_data = g.serialize(format="xml") + with open(dump_output, "w") as f: + f.write(xml_data) + elif "ttl" in dump_output: + g = rdflib.Graph() + g.parse(dump_output, format="nt") + ttl_data = g.serialize(format="ttl") + with open(dump_output, "w") as f: + f.write(ttl_data) + elif "tar.gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.gz","")) + with tarfile.open(dump_output, "w:gz") as tar: + tar.add(dump_output.replace(".tar.gz",""), arcname=dump_output.replace(".tar.gz","")) + elif "tar.xz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".tar.xz","")) + with tarfile.open(dump_output, "w:xz") as tar: + tar.add(dump_output.replace(".tar.xz",""), arcname=dump_output.replace(".tar.xz","")) + elif ".gz" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".gz","")) + with open(dump_output.replace(".gz",""), 'rb') as f_in: + with gzip.open(dump_output, 'wb') as f_out: + f_out.writelines(f_in) + elif ".zip" in dump_output: + os.system("mv " + dump_output + " " + dump_output.replace(".zip","")) + zip = zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) + zip.write(dump_output.replace(".zip",""), os.path.basename(dump_output.replace(".zip",""))) + zip.close() + else: + os.system("cp " + repeat_output + " " + dump_output) for row in data: number_triple += executor.submit(semantify_postgres, row, row_headers, @@ -7123,28 +11469,4 @@ def semantify(config_path, log_path='error.log'): duration = time.time() - start_time - logger.info("Successfully semantified all datasets in {:.3f} seconds.".format(duration)) - - -""" -According to the meeting held on 11.04.2018, semantifying json files != a top priority right -now, thus the reimplementation of following functions remain largely undocumented and unfinished. - -def json_generator(file_descriptor, iterator): - if len(iterator) != 0: - if "[*]" not in iterator[0] and iterator[0] != "$": - yield from json_generator(file_descriptor[iterator[0]], iterator[1:]) - elif "[*]" not in iterator[0] and iterator[0] == "$": - yield from json_generator(file, iterator[1:]) - elif "[*]" in iterator[0] and "$" not in iterator[0]: - file_array = file_descriptor[iterator[0].replace("[*]","")] - for array_elem in file_array: - yield from json_generator(array_elem, iterator[1:]) - elif iterator[0] == "$[*]": - for array_elem in file_descriptor: - yield from json_generator(array_elem, iterator[1:]) - else: - yield file_descriptor - - -""" + logger.info("Successfully semantified all datasets in {:.3f} seconds.".format(duration)) \ No newline at end of file
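
Editor's note on the repeated post-processing blocks: every semantify_* call in this patch is followed by the same chain of branches that re-serializes the freshly written N-Triples dump into the format implied by its file name (jsonld, n3, rdfjson, rdfxml, ttl) and then optionally compresses it (tar.gz, tar.xz, .gz, .zip). The following is a minimal sketch of how those branches could be consolidated into one helper; postprocess_dump_output is a hypothetical name, while extract_prefixes_from_ttl and generate_rdfjson are the helpers the patch itself calls. The sketch also swaps os.system("mv ...") for shutil.move and avoids shadowing the built-in zip; it is not part of the diff.

    import gzip
    import json
    import os
    import shutil
    import tarfile
    import zipfile

    import rdflib

    # Marker substrings used by the patch, mapped to rdflib format names.
    RDF_FORMATS = {"jsonld": "json-ld", "rdfxml": "xml", "n3": "n3", "ttl": "ttl"}

    def postprocess_dump_output(dump_output, mapping_path):
        # Hypothetical consolidation of the serialize-and-compress branches
        # repeated after every semantify_* call in this patch.
        for marker, fmt in RDF_FORMATS.items():
            if marker in dump_output:
                g = rdflib.Graph()
                g.parse(dump_output, format="nt")  # dumps are first written as N-Triples
                if marker == "jsonld":
                    context = extract_prefixes_from_ttl(mapping_path)  # helper from the patch
                    payload = g.serialize(format=fmt, context=context)
                else:
                    payload = g.serialize(format=fmt)
                with open(dump_output, "w") as f:
                    f.write(payload)
                return
        if "rdfjson" in dump_output:
            g = rdflib.Graph()
            g.parse(dump_output, format="nt")
            with open(dump_output, "w") as f:
                json.dump(generate_rdfjson(g), f)  # helper from the patch
        elif "tar.gz" in dump_output or "tar.xz" in dump_output:
            # tar.* must be tested before the bare ".gz" suffix, as in the patch.
            ext = ".tar.gz" if "tar.gz" in dump_output else ".tar.xz"
            plain = dump_output.replace(ext, "")
            shutil.move(dump_output, plain)
            with tarfile.open(dump_output, "w:gz" if ext == ".tar.gz" else "w:xz") as tar:
                tar.add(plain, arcname=plain)
        elif ".gz" in dump_output:
            plain = dump_output.replace(".gz", "")
            shutil.move(dump_output, plain)
            with open(plain, "rb") as f_in, gzip.open(dump_output, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        elif ".zip" in dump_output:
            plain = dump_output.replace(".zip", "")
            shutil.move(dump_output, plain)
            with zipfile.ZipFile(dump_output, "w", zipfile.ZIP_DEFLATED) as archive:
                archive.write(plain, os.path.basename(plain))

Calling postprocess_dump_output(dump_output, config[dataset_i]["mapping"]) once per logical dump would replace each repeated block; behavior matches the patch under its assumption that dump file names embed exactly one of the markers above.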
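The stash/restore dance around the duplicate table (temp_generated = g_triples; g_triples = {}; ...; g_triples = temp_generated) gives every logical dump a fresh duplicate-tracking table without disturbing global duplicate removal. A context-manager sketch of that pattern, assuming the module-level g_triples global from __init__.py; fresh_duplicate_table is a hypothetical name:

    from contextlib import contextmanager

    @contextmanager
    def fresh_duplicate_table():
        # Stash the global duplicate-tracking table so a logical dump starts
        # empty, then restore it so global duplicate removal is unaffected.
        global g_triples
        saved = g_triples
        g_triples = {}
        try:
            yield
        finally:
            g_triples = saved

Usage would be "with fresh_duplicate_table(): write one logical dump"; the try/finally restores the table even if a semantify_* worker raises.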
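For "endpoint:" sources, the patch runs the triples map's iterator as a SPARQL SELECT query via SPARQLWrapper and flattens each JSON binding into a {variable: value} dict before handing the rows to semantify_file. In isolation that step looks like the sketch below; fetch_endpoint_rows is a hypothetical name, the SPARQLWrapper calls are the ones the patch uses:

    from SPARQLWrapper import SPARQLWrapper, JSON

    def fetch_endpoint_rows(endpoint_url, query):
        # Run the triples map's iterator as a SPARQL SELECT and flatten the
        # JSON result bindings into plain {variable: value} rows.
        sparql = SPARQLWrapper(endpoint_url)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        return [
            {var: binding["value"] for var, binding in row.items()}
            for row in results["results"]["bindings"]
        ]

In the patch's terms: rows = fetch_endpoint_rows(source.replace("endpoint:", ""), triples_map.iterator), after which the rows are processed like any CSV record set.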
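The JSONPath branch now also accepts compressed remote sources: the file is downloaded with requests, unpacked according to its extension, and the extracted file is json.load-ed. A condensed sketch under the same assumption the patch makes, namely that each archive contains a single member named like the archive minus its extension; load_remote_json is a hypothetical name:

    import gzip
    import io
    import json
    import tarfile
    import zipfile

    import requests

    def load_remote_json(url):
        # Derive the local file name the same way the patch does:
        # fragment if present, otherwise the last path segment.
        name = url.split("#")[1] if "#" in url else url.rsplit("/", 1)[-1]
        payload = requests.get(url).content
        if name.endswith(".tar.gz") or name.endswith(".tar.xz"):
            with tarfile.open(fileobj=io.BytesIO(payload)) as tar:  # "r" mode is compression-transparent
                tar.extractall()
            extracted = name.rsplit(".tar.", 1)[0]  # assumes one member named like the archive
        elif name.endswith(".zip"):
            with zipfile.ZipFile(io.BytesIO(payload)) as archive:
                archive.extractall()
            extracted = name[: -len(".zip")]
        elif name.endswith(".gz"):
            return json.loads(gzip.decompress(payload))
        else:
            return json.loads(payload)
        with open(extracted) as f:
            return json.load(f)

Unlike the patch, this sketch keeps the download in memory instead of writing the archive to disk first; the extension checks are ordered so tar.gz/tar.xz are matched before the bare .gz suffix, mirroring the patch's elif chain.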