From e09e322c885f9353df22eefd4e4389362ab3bf31 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Wed, 31 Jan 2024 06:53:25 +0100 Subject: [PATCH] New Qleverfile for UniProt Since https://github.com/ad-freiburg/qlever/pull/1184, UniProt can be built with just this Qleverfile (without the need for code changes like before). --- Qleverfiles/Qleverfile.uniprot | 68 ++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/Qleverfiles/Qleverfile.uniprot b/Qleverfiles/Qleverfile.uniprot index a49fb7ba..1d83168f 100644 --- a/Qleverfiles/Qleverfile.uniprot +++ b/Qleverfiles/Qleverfile.uniprot @@ -1,38 +1,42 @@ # Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control # -# qlever get-data # See https://github.com/ad-freiburg/qlever/wiki/Using-QLever-for-UniProt -# qlever index # takes ~85 hours and ~100 GB RAM (on an AMD Ryzen 9 5900X) -# qlever start # starts the server (takes around 2 minutes) +# qlever get-data # download RDFXML and convert to NT (around 1 TB each) +# qlever index # takes ~ 1.5 days and ~40 GB RAM (on an AMD Ryzen 9 5900X) +# qlever start # starts the server (takes a few seconds) +# +# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv +# Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries) +# +# Set DATE to the date of the latest release +# +# IMPORTANT: Build on SSD, disk space required: ~ 10 TB. For running the server, +# the uniprot.index.???.meta files can be on HDD. 
+ +[data] +NAME = uniprot +DATE = 2024-01-24 +DOWNLOAD_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf +GET_RDFXML_CMD = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done +RDFXML2NT_CMD = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | xz -c > nt.${DATE}/$$(basename ${RDFXML} | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.xz/') && echo 'DONE converting ${RDFXML}'"; done | parallel +GET_DATA_CMD = ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD} +INDEX_DESCRIPTION = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE} -# Indexer settings -DB = uniprot -DATE = "2022-10-12" -RDF_FILES = "ttl.${DATE}/*.ttl.xz" -EXTRACT_PREFIXES = "for F in ${RDF_FILES}; do xzcat \$F | head -1000 | \grep ^@prefix; done | sort -u > ${DB}.prefix-definitions" -CAT_FILES = "xzcat -f ${DB}.prefix-definitions ${RDF_FILES}" -WITH_TEXT_INDEX = false -PSO_AND_POS_ONLY = true -STXXL_MEMORY = 80G -SETTINGS_JSON = '{ "languages-internal": ["en"], "prefixes-external": [ "