-
Notifications
You must be signed in to change notification settings - Fork 36
Querying datasets created by csv2rdf4lod
csv2rdf4lod includes an extensive amount of metadata when it converts tabular data to RDF, allowing us to query and navigate among datasets in more ways than just having the data. This page collects and organizes queries that have been useful for exploring datasets aggregated by an aggregator using csv2rdf4lod. See list of SPARQL endpoints containing datasets produced by csv2rdf4lod for some places that you can execute these queries.
- Namespace prefix handling
- Dataset composition resulting from naming by source, dataset, and version **
- Aggregating subsets of converted datasets
- Provenance
- Analyzing the explicit connectivity of disparate datasets
- conversion:links_via
Show a timeline of which version of the converter was used, how many datasets were converted with it, and when it was last used (results). A "fully up to date" data aggregation repository would have a single row; the more rows, the less the datasets have been kept up to date.
# Converter usage timeline.
# For every converter (the pmlj:hasInferenceEngine of the inference step that
# produced a versioned dataset's dump file), together with its doap:revision
# when one is recorded, report:
#   ?modified - the latest dcterms:modified among the datasets it produced
#   ?count    - how many versioned datasets it produced
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX doap: <http://usefulinc.com/ns/doap#>
PREFIX pmlp: <http://inference-web.org/2.0/pml-provenance.owl#>
PREFIX pmlj: <http://inference-web.org/2.0/pml-justification.owl#>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
SELECT DISTINCT (MAX(?date) AS ?modified) ?converter ?converter_doap_revision (COUNT(?dataset) AS ?count)
WHERE {
  GRAPH <http://logd.tw.rpi.edu/vocab/Dataset> {
    ?dataset a conversion:VersionedDataset ;
             void:dataDump ?dumpFile .
    # Not every dataset carries a modification date.
    OPTIONAL { ?dataset dcterms:modified ?date }
    # The node set whose conclusion is the dump file points at the inference
    # step, which in turn names the converter that ran.
    ?ns pmlj:hasConclusion ?dumpFile ;
        pmlj:isConsequentOf [
          a pmlj:InferenceStep ;
          pmlj:hasInferenceEngine ?converter
        ] .
    # The revision is optional so converters without one are still listed.
    OPTIONAL { ?converter doap:revision ?converter_doap_revision }
  }
}
GROUP BY ?converter ?converter_doap_revision
ORDER BY DESC(?modified) ?count
Which datasets reference resources that are owl:sameAs
other resources (results)? Note that this queries the metadata and NOT the instance data, so it executes lickety-split.
# Enhancement parameters that declare a conversion:links_via target.
# Reads only the ConversionProcess metadata graph and returns, per layer, the
# CSV column number, its label, and the linked resource set.
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX ov: <http://open.vocab.org/terms/>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
SELECT DISTINCT ?layer ?col ?label ?lod_link
WHERE {
  GRAPH <http://purl.org/twc/vocab/conversion/ConversionProcess> {
    ?layer conversion:conversion_process ?process .
    ?process conversion:enhance ?enhancement .
    ?enhancement ov:csvCol ?col .
    ?enhancement conversion:label ?label .
    ?enhancement conversion:links_via ?lod_link .
  }
}
Is there a way to know which datasets are fully loaded in the sparql endpoint?
Was there any update on this question? Does running the SPARQL query below at http://logd.tw.rpi.edu/sparql return the complete list of loaded datasets?
# Estimated size of each named graph whose name mentions "data-gov":
# sums the conversion:num_triples reported by the graph's void:subset datasets.
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
SELECT ?g (SUM(?triples) AS ?estimated_triples)
WHERE {
  GRAPH ?g {
    ?g void:subset ?subdataset .
    ?subdataset conversion:num_triples ?triples .
    # ?g is an IRI; REGEX is defined on string literals, so convert with STR()
    # instead of relying on an engine-specific implicit cast.
    FILTER REGEX(STR(?g), "data-gov")
  }
}
GROUP BY ?g
bad sources:
# Count the distinct dcterms:source values of conversion:Dataset resources,
# ignoring any source whose IRI mentions "provenance_file".
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
SELECT (COUNT(DISTINCT ?organization) AS ?count)
WHERE {
  GRAPH <http://logd.tw.rpi.edu/vocab/Dataset> {
    ?dataset a conversion:Dataset ;
             dcterms:source ?organization .
    FILTER(!REGEX(STR(?organization), ".*provenance_file.*"))
  }
}
# Subsets of data-gov dataset 92, newest first.
# ?modified stays unbound for subsets that have no dcterms:modified value.
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX void: <http://rdfs.org/ns/void#>
SELECT ?subset ?modified
WHERE {
  GRAPH <http://logd.tw.rpi.edu/vocab/Dataset> {
    <http://logd.tw.rpi.edu/source/data-gov/dataset/92>
        void:subset ?subset .
    OPTIONAL {
      ?subset dcterms:modified ?modified
    }
  }
}
ORDER BY DESC(?modified)
Alvaro is using this query http://logd.tw.rpi.edu/query/logd-data-list-latest-dump-file-for-dataset.sql to obtain the latest dump for a dataset. However, dump files appear only for some datasets (see http://logd.tw.rpi.edu/datasets)
# Dump files per abstract dataset.
# For each conversion:Dataset, walk to its versioned datasets and, when
# present, to their layers, returning every void:dataDump found on a layer
# that also has a dcterms:created timestamp. ?dump_file is unbound for
# datasets whose versions expose no such layer.
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
SELECT DISTINCT ?dataset ?dump_file
WHERE {
  GRAPH <http://logd.tw.rpi.edu/vocab/Dataset> {
    ?dataset a conversion:Dataset ;
             void:subset ?version .
    ?version a conversion:VersionedDataset .
    OPTIONAL {
      ?version void:subset ?layer .
      {
        {
          ?layer void:dataDump ?dump_file ;
                 dcterms:created ?creationtime .
        }
        UNION
        {
          # NOTE(review): ?descriminator is not joined to ?layer here, so this
          # branch matches the same layers as the first one whenever any
          # num_triples statement exists. It may have been meant to read
          # "?layer void:subset ?descriminator" - confirm before changing.
          ?descriminator conversion:num_triples ?triples .
          ?layer void:dataDump ?dump_file ;
                 dcterms:created ?creationtime .
        }
      }
    }
  }
}
# NOTE(review): ?creationtime is not projected; ordering a DISTINCT result by
# it may not behave the same on every engine - confirm on the target endpoint.
ORDER BY DESC(?creationtime)
- http://tw.rpi.edu/wiki/tw:17_Sept_2010_SWC_LOG_notes
- http://logd.tw.rpi.edu/tutorial/exploring_logd_metadata_with_sparql
- http://logd.tw.rpi.edu/tutorial/How_to_find_datasets_using_the_LOGD_sparql_endpoint
- http://code.google.com/p/data-gov-wiki/issues/detail?id=35
(a few more sprinkled around)
Trying to get to the param files (so we can count their triples and thereby quantify the effort needed to create them).
Use case: find the parameters used during the conversion. (querying this is now difficult and needs to be eased)
# Conclusions reachable from an rdf:first list entry whose format is
# RDFAbstractSyntax, looked up in the Dataset metadata graph.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX pmlp: <http://inference-web.org/2.0/pml-provenance.owl#>
PREFIX pmlj: <http://inference-web.org/2.0/pml-justification.owl#>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
SELECT DISTINCT ?conclusion
WHERE {
  GRAPH <http://logd.tw.rpi.edu/vocab/Dataset> {
    ?list_cell rdf:first ?list_member .
    ?list_member pmlj:hasConclusion ?conclusion .
    ?conclusion
        pmlp:hasFormat
        <http://inference-web.org/registry/FMT/RDFAbstractSyntax.owl#RDFAbstractSyntax> .
  }
}
Finding all of the sources:
# Every pmlp:Source that does not have an irw:redirectsTo statement.
# Negation is expressed as OPTIONAL + !BOUND: the optional pattern binds
# ?redirect_target only when a redirect exists, and the filter keeps the rest.
PREFIX irw: <http://www.ontologydesignpatterns.org/ont/web/irw.owl#>
PREFIX pmlp: <http://inference-web.org/2.0/pml-provenance.owl#>
SELECT ?url
WHERE {
  GRAPH <http://logd.tw.rpi.edu/vocab/Dataset> {
    ?url a pmlp:Source .
    OPTIONAL {
      ?url irw:redirectsTo ?redirect_target
    }
    FILTER(!BOUND(?redirect_target))
  }
}
Finding datasets from their conversion parameters:
# Datasets that have a conversion_process description in the ConversionProcess
# graph and are also typed (as anything) in the Dataset graph.
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
SELECT ?dataset
WHERE {
  GRAPH <http://purl.org/twc/vocab/conversion/ConversionProcess> {
    ?dataset conversion:conversion_process ?process
  }
  GRAPH <http://logd.tw.rpi.edu/vocab/Dataset> {
    ?dataset a ?type
  }
  # Alternative source of typing, kept disabled as in the original:
  #GRAPH <http://purl.org/twc/vocab/conversion/MetaDataset> {
  # ?dataset a []
  #}
}
Modifying LOGD's dataset listing query with negation (er.. OPTIONAL{}+!BOUND()..) shows datasets that have been converted but do not have metadata (results):
# Converted datasets that have no catalog metadata.
# The first GRAPH block finds every conversion:Dataset with a page, an
# identifier and at least one layer exposing a dump file, and collects the
# layer's triple count (reported either on the layer itself or on one of its
# subsets). The OPTIONAL block tries to attach title/description/agency from
# the MetaDataset graph; FILTER(!BOUND(?Title)) then keeps only the datasets
# for which that attempt failed, so the metadata columns come back unbound.
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
PREFIX catalog: <http://logd.tw.rpi.edu/source/twc-rpi-edu/dataset/dataset-catalog/vocab/enhancement/1/>
PREFIX ds92: <http://logd.tw.rpi.edu/source/data-gov/dataset/92/vocab/enhancement/1/>
SELECT DISTINCT ?dataset ?Dataset_Identifier ?Page ?Title ?Agency
       ?Description ?homepage (SUM(?triples) AS ?Number_of_Triples)
WHERE {
  GRAPH <http://logd.tw.rpi.edu/vocab/Dataset> {
    ?dataset a conversion:Dataset ;
             foaf:isPrimaryTopicOf ?Page ;
             dcterms:identifier ?Dataset_Identifier ;
             void:subset ?version .
    ?version a conversion:VersionedDataset .
    ?version void:subset ?layer .
    {
      ?layer conversion:num_triples ?triples .
      ?layer void:dataDump ?dump_file .
    } UNION {
      ?layer void:subset ?descriminator .
      ?descriminator conversion:num_triples ?triples .
      ?layer void:dataDump ?dump_file .
    }
  }
  OPTIONAL {
    GRAPH <http://purl.org/twc/vocab/conversion/MetaDataset> {
      ?dataset dcterms:title ?Title .
      ?dataset dcterms:description ?Description .
      ?dataset catalog:source_agency [ rdfs:label ?Agency ]
      OPTIONAL {
        ?dataset foaf:homepage ?homepage .
      }
    }
  }
  # Negation via OPTIONAL + !BOUND: keep only datasets with no title.
  FILTER(!BOUND(?Title))
}
# Every projected non-aggregate variable must be a grouping key for the SUM
# to be well-defined.
GROUP BY ?dataset ?Dataset_Identifier ?Page ?Title ?Agency ?Description ?homepage
ORDER BY ?Dataset_Identifier
# IRI-named resources in the nitrd-gov 2011-Jan-27 version graph that have a
# foaf:isPrimaryTopicOf page, with their dcterms:contributor when present.
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX ov: <http://open.vocab.org/terms/>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
SELECT DISTINCT *
WHERE {
  GRAPH <http://logd.tw.rpi.edu/source/nitrd-gov/dataset/federal_research_and_development_budget_for_networking_and_information_technology/version/2011-Jan-27> {
    ?d foaf:isPrimaryTopicOf ?p .
    # Skip blank-node subjects.
    FILTER(ISURI(?d))
    OPTIONAL { ?d dcterms:contributor ?c }
  }
}
ORDER BY ?d
The majority of named graphs are versioned datasets. How big are they? (This does not include multi-part datasets.)
# Triple counts of versioned datasets, largest first.
# A row is produced when a named graph is itself the void:subset of some
# abstract dataset and has a layer reporting conversion:num_triples.
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
PREFIX void: <http://rdfs.org/ns/void#>
SELECT ?triples ?versioned
WHERE {
  GRAPH ?versioned {
    ?abstract  void:subset            ?versioned .
    ?versioned void:subset            ?layer .
    ?layer     conversion:num_triples ?triples .
  }
}
ORDER BY DESC(?triples)
# Total of the conversion:num_triples values reported by the layers of all
# versioned datasets that are stored in a named graph of their own name.
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
# SPARQL 1.1 requires a projected aggregate to be written "(expr AS ?var)".
SELECT (SUM(?triples) AS ?total_triples)
WHERE {
  GRAPH ?versioned {
    ?abstract void:subset ?versioned .
    ?versioned void:subset ?layer .
    ?layer conversion:num_triples ?triples .
  }
}
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX conversion: <http://purl.org/twc/vocab/conversion/>
# Get the dump files for the most recently modified versions of any MetaDataset
# Omit the dump files for any version of the same abstract dataset that is older than another version.
SELECT ?versioned ?modified (MIN(?d) AS ?dump)
WHERE {
  {
    # Per abstract dataset, find the latest dcterms:modified among its versions
    # that contain a layer typed both LayerDataset and MetaDataset.
    SELECT ?abstract (MAX(?m) AS ?modified)
    WHERE {
      GRAPH ?g {
        ?abstract void:subset [ void:subset [ a conversion:LayerDataset, conversion:MetaDataset ]; dcterms:modified ?m ]
      }
    }
    GROUP BY ?abstract
  }
  # Join back on (?abstract, ?modified) to pick the version carrying that
  # latest date, and collect its dump files.
  GRAPH ?g {
    ?abstract void:subset ?versioned .
    ?versioned void:subset [ a conversion:LayerDataset, conversion:MetaDataset ];
               dcterms:modified ?modified;
               void:dataDump ?d
  }
}
# MIN(?d) picks a single dump file when a version lists several.
GROUP BY ?versioned ?modified
ORDER BY ?versioned ?modified