Skip to content

Commit

Permalink
Update Qleverfile for PubChem
Browse files Browse the repository at this point in the history
  • Loading branch information
Hannah Bast committed Feb 18, 2024
1 parent 03d1be5 commit 445f06d
Showing 1 changed file with 36 additions and 12 deletions.
48 changes: 36 additions & 12 deletions Qleverfiles/Qleverfile.pubchem
Original file line number Diff line number Diff line change
@@ -1,15 +1,39 @@
# Qleverfile for PubChem, use with https://github.com/ad-freiburg/qlever-control
#
# qlever get-data # downloads .gz files of total size 81 GB (as of 31.07.2022)
# qlever index # takes ~20 hours and ~40 GB RAM on an AMD Ryzen 9 5900X
# qlever start # starts the server (takes around 2 minutes)
# qlever get-data # downloads .gz files of total size 114 GB; see NOTES 2, 3, 4
# qlever index # takes ~5 hours and ~20 GB RAM on an AMD Ryzen 9 5900X
# qlever start # starts the server (a few seconds)
#
# TODO: The instance on https://qlever.cs.uni-freiburg.de/pubchem also contains
# the following ontologies, which are very useful for resolving names but which
# are not yet part of what is downloaded with GET_DATA_CMD.
# IMPORTANT NOTES:
#
# NOTE 1: The SPARQL endpoint at https://qlever.cs.uni-freiburg.de/pubchem also
# contains data from the following ontologies, which are very useful for
# resolving names of IRIs like `sio:SIO_000008` or `obo:IAO_0000412`, but which
# are not part of the PubChem RDF data. For the corresponding URLs, see
# https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1200479401 .
#
# bao bfo biopax-level3 chebi cheminf cito dublin_core_terms fabio go iao ncit
# obi pr ro sio skos so uo
#
# NOTE 2: The robots.txt file from https://ftp.ncbi.nlm.nih.gov currently
# disallows downloading the PubChem RDF data using `wget --recursive` as in the
# GET_DATA_CMD below. As a workaround, you can write a simple Python script
# (using `BeautifulSoup` and `urllib.parse`) to scrape the URLs from the HTML
# pages and download the files individually. This was done for the latest
# version of https://qlever.cs.uni-freiburg.de/pubchem .
#
# NOTE 3: Many of the TTL files have generic prefix definitions in the middle
# of the file, like @prefix ns23: <http://identifiers.org/biocyc/ARACYC:> .
# See https://github.com/ad-freiburg/qlever/issues/711#issuecomment-1197113953 .
# This is allowed by the Turtle standard, but VERY unusual. For use with
# QLever, convert the TTL files to NT before indexing; see GET_DATA_CMD below.
#
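# NOTE 4: Many of the files (TTL as well as NT) contain invalid IRIs because
# spaces, square brackets, and curly braces are not properly escaped. Here is
# a simple sed- and awk-based script that percent-encodes these characters in
# all IRIs in the NT files. It writes one command per file to
# fix-nt.commands.txt, and the commands are then run with `parallel`: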
#
# for NTGZ in nt.${DATE}/*.nt.gz; do echo "zcat $NTGZ | sed 's/> />\t/1; s/> />\t/1; s/ \.\$/\t./' | awk 'BEGIN{FS=OFS=\"\t\"} {for (i = 1; i <= 3; i++) if (\$i ~ /^<.*>\$/) { gsub(/ /, \"%20\", \$i); gsub(/\[/, \"%5B\", \$i); gsub(/\]/, \"%5D\", \$i); gsub(/{/, \"%7B\", \$i); gsub(/}/, \"%7D\", \$i); } print }' | sed 's/\t/ /g' | gzip -c > nt.${DATE}.FIXED/$(basename $NTGZ)"; done > fix-nt.commands.txt
# cat fix-nt.commands.txt | parallel


[DEFAULT]
Expand All @@ -18,23 +42,23 @@ NAME = pubchem
# Where the PubChem RDF data comes from and how it is fetched and converted.
[data]
# Base URL of the PubChem RDF download area; referenced by GET_DATA_CMD and
# INDEX_DESCRIPTION.
GET_DATA_URL = https://ftp.ncbi.nlm.nih.gov/pubchem/RDF
# Two steps joined by `&&`:
#   1. `wget --recursive` mirrors GET_DATA_URL into the directory `ttl`,
#      passing --exclude-directories=nbr2d,nbr3d and skipping files that
#      already exist (--no-clobber).
#   2. `find` lists all *.ttl.gz and *.ttl files under `ttl`; for each one,
#      `parallel` decompresses it (`zcat -f`), converts Turtle to N-Triples
#      with Jena `riot` in a Docker container, and gzips the result to nt/.
# `$$` is presumably the escape for a literal `$` in this file format — confirm
# against the Qleverfile parser.
# NOTE(review): `basename -s .ttl.gz -s ttl` passes two -s options; verify that
# both suffixes are really stripped (and whether the second should be `.ttl`).
# NOTE(review): every job redirects stderr to the same pubchem.ttl2nt.stderr
# with `2>` — confirm that parallel jobs do not overwrite each other's output.
# NOTE(review): the command writes to nt/ but does not create it — confirm the
# directory exists beforehand.
GET_DATA_CMD = wget --recursive --exclude-directories=nbr2d,nbr3d --no-host-directories --no-clobber --cut-dirs=2 --directory-prefix=ttl ${GET_DATA_URL} && find ttl \( -name "*.ttl.gz" -o -name "*.ttl" \) | parallel 'zcat -f {} | docker run --rm -i stain/jena riot --syntax=TTL --output=NT /dev/stdin 2> pubchem.ttl2nt.stderr | gzip > nt/$$(basename -s .ttl.gz -s ttl {}).nt.gz'
# Human-readable description of the indexed data, including its version date.
# NOTE(review): INDEX_DESCRIPTION is assigned twice (version 29.10.2023 and
# version 12.12.2023) — presumably the old and the new line of a diff; confirm
# which one should remain.
INDEX_DESCRIPTION = PubChem RDF from ${GET_DATA_URL}, version 29.10.2023 (all folders except nbr2d and nbr3d)
INDEX_DESCRIPTION = PubChem RDF from ${GET_DATA_URL}, version 12.12.2023 (all folders except nbr2d and nbr3d)

# Input files and settings for building the index.
[index]
# Glob patterns of the gzipped N-Triples input files.
# NOTE(review): FILE_NAMES is assigned twice — presumably the old and the new
# line of a diff; confirm which one should remain. The second value reads from
# nt.ONTOLOGIES/ and nt.2024-02-05/, whereas GET_DATA_CMD writes to nt/.
FILE_NAMES = nt/*.nt.gz nt/*/*.nt.gz nt/*/*/*.nt.gz
FILE_NAMES = nt.ONTOLOGIES/*.nt.gz nt.2024-02-05/*.nt.gz
# Command that writes the decompressed contents of all input files to stdout.
CAT_FILES = zcat ${FILE_NAMES}
# Presumably controls whether a text index is built in addition — TODO confirm.
WITH_TEXT_INDEX = false
# Presumably a memory budget used while building the index — TODO confirm.
STXXL_MEMORY = 10G
# JSON object with index-builder settings.
# NOTE(review): SETTINGS_JSON is assigned twice — presumably the old and the
# new line of a diff; confirm which one should remain. The two values differ in
# "languages-internal", "prefixes-external" and "ascii-prefixes-only", and only
# the first one is wrapped in single quotes; verify which quoting the parser
# expects.
SETTINGS_JSON = '{ "languages-internal": [""], "prefixes-external": [ "<http://rdf.ncbi.nlm.nih.gov/pubchem/" ], "ascii-prefixes-only": true, "num-triples-per-batch": 1000000 }'

SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 }

# Settings for the running server.
[server]
# Port number for the server.
PORT = 7023
# Access token, built from the value of NAME plus a fixed numeric suffix.
# NOTE(review): the token is stored in plain text in this file — confirm that
# this is acceptable for the deployment.
ACCESS_TOKEN = ${NAME}_310129823
# Presumably the memory limit for query processing — TODO confirm.
# NOTE(review): MEMORY_FOR_QUERIES is assigned twice (50G and 20G) — presumably
# the old and the new line of a diff; confirm which one should remain.
MEMORY_FOR_QUERIES = 50G
MEMORY_FOR_QUERIES = 20G
# Presumably the per-query timeout; the value carries the unit suffix `s`.
TIMEOUT = 120s

# Docker-related settings.
[docker]
# Presumably selects whether commands are run inside Docker — TODO confirm.
# NOTE(review): USE_DOCKER is assigned twice with opposite values (true, then
# false) — presumably the old and the new line of a diff; confirm which one
# should remain.
USE_DOCKER = true
USE_DOCKER = false
# Name of the Docker image for QLever.
QLEVER_DOCKER_IMAGE = adfreiburg/qlever

[ui]
Expand Down

0 comments on commit 445f06d

Please sign in to comment.