Skip to content

Commit

Permalink
New Qleverfile for UniProt
Browse files Browse the repository at this point in the history
Since ad-freiburg/qlever#1184, UniProt can be
built with just this Qleverfile (without the need for code changes like
before).
  • Loading branch information
Hannah Bast committed Jan 31, 2024
1 parent 75ae21d commit e09e322
Showing 1 changed file with 36 additions and 32 deletions.
68 changes: 36 additions & 32 deletions Qleverfiles/Qleverfile.uniprot
Original file line number Diff line number Diff line change
@@ -1,38 +1,42 @@
# Qleverfile for UniProt, use with https://github.com/ad-freiburg/qlever-control
#
# qlever get-data # See https://github.com/ad-freiburg/qlever/wiki/Using-QLever-for-UniProt
# qlever index # takes ~85 hours and ~100 GB RAM (on an AMD Ryzen 9 5900X)
# qlever start # starts the server (takes around 2 minutes)
# qlever get-data # download RDFXML and convert to NT (around 1 TB each)
# qlever index # takes ~ 1.5 days and ~40 GB RAM (on an AMD Ryzen 9 5900X)
# qlever start # starts the server (takes a few seconds)
#
# Install packages: sudo apt install -y libxml2-utils parallel xz-utils pv
# Install manually: Apache Jena binaries (https://dlcdn.apache.org/jena/binaries)
#
# Set DATE to the date of the latest release
#
# IMPORTANT: Build on SSD, disk space required: ~ 10 TB. For running the server,
# the uniprot.index.???.meta files can be on HDD.

[data]
# Dataset name and release date. DATE is used in the names of the download
# directory (rdf.<DATE>) and the conversion output directory (nt.<DATE>).
NAME = uniprot
DATE = 2024-01-24
DOWNLOAD_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf
# NOTE on escaping: the ${...} syntax used in this file is resolved against keys
# of this Qleverfile, and a doubled dollar sign is the escape for a literal one
# (assuming configparser-style extended interpolation, as the ${data:DATE}
# references suggest). Shell variables and command substitutions must therefore
# be written with a doubled dollar sign.
#
# Download step: create rdf.<DATE>, fetch RELEASE.meta4, strip the attributes
# from the <metalink> tag, extract every file URL with location "ch" via
# xmllint, and wget each one into rdf.<DATE>, appending wget's output to
# uniprot.download-log.
GET_RDFXML_CMD = mkdir -p rdf.${DATE} && curl -s ${DOWNLOAD_URL}/RELEASE.meta4 | sed "s/<metalink.*/<metalink>/" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" - | while read URL; do wget --no-verbose -P rdf.${DATE} $$URL 2>&1 | tee -a uniprot.download-log; done
# Conversion step: for every (optionally xz-compressed) .owl/.rdf file in
# rdf.<DATE>, emit one shell command line that pipes the file through
# `rdfxml --output=nt` and writes nt.<DATE>/<basename>.nt.xz; the emitted command
# lines are executed by GNU parallel. The loop variable RDFXML is a shell
# variable, so every reference to it uses the doubled-dollar escape.
RDFXML2NT_CMD = mkdir -p nt.${DATE} && for RDFXML in rdf.${DATE}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=nt 2> /dev/null | xz -c > nt.${DATE}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/nt.xz/') && echo 'DONE converting $$RDFXML'"; done | parallel
# Full data preparation: download first, then convert.
GET_DATA_CMD = ${GET_RDFXML_CMD} && ${RDFXML2NT_CMD}
INDEX_DESCRIPTION = Complete UniProt data from ${DOWNLOAD_URL}, version ${DATE}

# Indexer settings
DB = uniprot
DATE = "2022-10-12"
RDF_FILES = "ttl.${DATE}/*.ttl.xz"
EXTRACT_PREFIXES = "for F in ${RDF_FILES}; do xzcat \$F | head -1000 | \grep ^@prefix; done | sort -u > ${DB}.prefix-definitions"
CAT_FILES = "xzcat -f ${DB}.prefix-definitions ${RDF_FILES}"
WITH_TEXT_INDEX = false
PSO_AND_POS_ONLY = true
STXXL_MEMORY = 80G
SETTINGS_JSON = '{ "languages-internal": ["en"], "prefixes-external": [ "<http://purl.uniprot.org/uniprot/", "<http://purl.uniprot.org/uniparc/", "<http://purl.uniprot.org/uniref/", "<http://purl.uniprot.org/isoforms/", "<http://purl.uniprot.org/range/", "<http://purl.uniprot.org/position/", "<http://purl.uniprot.org/refseq/", "<http://purl.uniprot.org/embl-cds/", "<http://purl.uniprot.org/EMBL", "<http://purl.uniprot.org/PATRIC", "<http://purl.uniprot.org/SEED", "<http://purl.uniprot.org/gi", "<http://rdf.ebi.ac.uk/resource", "<http://purl.uniprot.org/SHA-384" ], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-partial-vocab": 20000000 }'
GET_DATA_CMD = 'echo "Please read https://github.com/ad-freiburg/qlever/wiki/Using-QLever-for-UniProt"'
INDEX_DESCRIPTION = "Complete UniProt data from https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf, version ${DATE}"
[index]
# Input files: the xz-compressed N-Triples files in the nt.<DATE> directory,
# where <DATE> is the DATE key of the [data] section.
FILE_NAMES = nt.${data:DATE}/*.nt.xz
# Command that produces the input stream: four parallel xzcat jobs (GNU
# parallel, temporary files in the current directory), piped through pv in
# quiet mode with a 5G buffer.
CAT_FILES = parallel --tmpdir . -j 4 'xzcat -f {}' ::: nt.${data:DATE}/*.nt.xz | pv -q -B 5G
STXXL_MEMORY = 60G
# JSON settings: no internal languages, a single empty external prefix, en/US
# locale ignoring punctuation, ASCII-only prefixes, 25,000,000 triples per batch.
# NOTE(review): the empty external prefix presumably matches every IRI; confirm
# against the QLever documentation.
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 }

# Server settings
HOSTNAME = $(hostname -f)
SERVER_PORT = 7018
ACCESS_TOKEN = ${DB}_%RANDOM%
MEMORY_FOR_QUERIES = 80G
CACHE_MAX_SIZE = 50G
CACHE_MAX_SIZE_SINGLE_ENTRY = 10G
CACHE_MAX_NUM_ENTRIES = 100
[server]
# Settings for the server (presumably the one started by `qlever start`; confirm).
PORT = 7018
# The token is the NAME key of the [data] section followed by a fixed numeric
# suffix. NOTE(review): the token is stored in plain text in this file; replace
# it if the deployment should not use a publicly known token.
ACCESS_TOKEN = ${data:NAME}_1369924040
MEMORY_FOR_QUERIES = 100G
CACHE_MAX_SIZE = 70G

# QLever binaries
QLEVER_BIN_DIR = %QLEVER_BIN_DIR%
USE_DOCKER = true
QLEVER_DOCKER_IMAGE = qlever.uniprot
QLEVER_DOCKER_CONTAINER = qlever.${DB}
[docker]
# Docker is enabled and the image name is adfreiburg/qlever.
# NOTE(review): presumably this makes the qlever commands run inside that image
# instead of using local binaries; confirm against the qlever-control docs.
USE_DOCKER = true
IMAGE = adfreiburg/qlever

# QLever UI
QLEVERUI_PORT = 7000
QLEVERUI_DIR = qlever-ui
QLEVERUI_CONFIG = uniprot
[ui]
# Settings for the QLever UI: port 7000 and a configuration named "uniprot".
# NOTE(review): presumably CONFIG selects a pre-defined UI backend
# configuration; confirm against the qlever-control docs.
PORT = 7000
CONFIG = uniprot

0 comments on commit e09e322

Please sign in to comment.