From 6cb7e3b29dd63ab62c71bb52e5a4a4d8044fe20c Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 17 Jun 2019 22:13:14 +0200 Subject: [PATCH 1/2] Really just checking verified commits, but some improvements too. --- README.md | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index fedbba4..8b687d0 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,31 @@ -# olddog +# OldDog The repository contains the code that accompanies the blog post [Teaching An Old Dog a new Trick](https://www.chriskamphuis.com/2019/03/06/teaching-an-old-dog-a-new-trick.html). -_olddog_ is build using maven: +## Preliminaries -` -mvn clean package appassembler:assemble -` +_OldDog_ is built using `maven`: -_olddog_ takes a Lucene index as input, this can be created by for example [Anserini](https://github.com/castorini/Anserini). -For example, the Robust 04 collection can be indexed as explained on [this](https://github.com/castorini/Anserini/blob/master/docs/experiments-robust04.md) page. + mvn clean package appassembler:assemble -After creating the index, the csv files representing the tables can be created issuing the following command: +_OldDog_ takes a Lucene index as input, for example as created by the [Anserini](https://github.com/castorini/Anserini) project. +The Robust 04 collection can be indexed as explained on [this Anserini page](https://github.com/castorini/Anserini/blob/master/docs/experiments-robust04.md). -` -nohup target/appassembler/bin/nl.ru.convert.Convert -index path/to/index -docs /tmp/docs.csv -dict /tmp/dict.csv -terms /tmp/terms.csv -` +## Setup -This creates multiple files that represent the columns of the `docs`, `dict` and `terms` -tables as described in the blog post. 
+After creating the index, the CSV files representing the database tables can be created by issuing the following command: + + nohup target/appassembler/bin/nl.ru.convert.Convert -index path/to/index -docs /tmp/docs.csv -dict /tmp/dict.csv -terms /tmp/terms.csv + +This creates multiple files that represent the columns of the `docs`, `dict` and `terms` tables as described in the blog post. The column store relational database [MonetDB](https://www.monetdb.org) can load -these files using the `copy into` [command](https://www.monetdb.org/Documentation/Cookbooks/SQLrecipes/CSV_bulk_loads). +these files using the `COPY INTO` [command](https://www.monetdb.org/Documentation/Cookbooks/SQLrecipes/CSV_bulk_loads). + +## Usage -After this it is possible issue the query described in the post: +After this final step it is possible to issue the query described in the post: ```sql WITH qterms AS (SELECT termid, docid, df FROM terms @@ -41,4 +42,8 @@ WITH qterms AS (SELECT termid, docid, df FROM terms SELECT scores.docid, score FROM (SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid) AS scores JOIN docs ON scores.docid=docs.docid ORDER BY score DESC; -``` \ No newline at end of file +``` + +## Team + +[**Chris Kamphuis**](https://github.com/chriskamphuis) and [**Arjen de Vries**](https://github.com/arjenpdevries). 
From 179a0c5fe169985d0d1f81f2cfeb2ed564952557 Mon Sep 17 00:00:00 2001 From: chriskamphuis Date: Wed, 19 Jun 2019 14:16:41 +0200 Subject: [PATCH 2/2] do not hardcode collection stats --- src/main/python/search_collection.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/main/python/search_collection.py b/src/main/python/search_collection.py index f56d684..76bb5c2 100755 --- a/src/main/python/search_collection.py +++ b/src/main/python/search_collection.py @@ -10,13 +10,13 @@ class SearchCollection: - def getQueryTemplate(self): + def getQueryTemplate(self, collection_size, avg_doc_len): queryTemplate = """ WITH qterms AS (SELECT termid, docid, count FROM terms WHERE termid IN ({})), subscores AS (SELECT docs.collection_id, docs.id, len, term_tf.termid, - term_tf.tf, df, (log((528155-df+0.5)/(df+0.5))*((term_tf.tf*(1.2+1)/ - (term_tf.tf+1.2*(1-0.75+0.75*(len/188.33)))))) AS subscore + term_tf.tf, df, (log(({}-df+0.5)/(df+0.5))*((term_tf.tf*(1.2+1)/ + (term_tf.tf+1.2*(1-0.75+0.75*(len/{})))))) AS subscore FROM (SELECT termid, docid, count as tf FROM qterms) AS term_tf JOIN (SELECT docid FROM qterms GROUP BY docid {}) @@ -30,13 +30,20 @@ def getQueryTemplate(self): conjunctive = 'HAVING COUNT(distinct termid) = {}' if self.args.disjunctive: conjunctive = '' - queryTemplate = queryTemplate.format('{}', conjunctive) + queryTemplate = queryTemplate.format('{}', collection_size, avg_doc_len, conjunctive) return queryTemplate def search(self): topics = self.topicReader.get_topics() ofile = open(self.args.output, 'w+') print("SCORING TOPICS") + + self.cursor.execute("SELECT COUNT(*) FROM docs;") + collection_size = self.cursor.fetchone()[0] + + self.cursor.execute("SELECT ROUND(AVG(len), 2) FROM docs;") + avg_doc_len = self.cursor.fetchone()[0] + for topic in topics: query_terms = topic['title'].split(" ") ids = [] @@ -47,9 +54,9 @@ def search(self): ids.append(str(term_id[0])) term_ids = ", ".join(ids) if self.args.disjunctive: - 
sql_query = self.getQueryTemplate().format(term_ids) + sql_query = self.getQueryTemplate(collection_size, avg_doc_len).format(term_ids) else: - sql_query = self.getQueryTemplate().format(term_ids, len(ids)) + sql_query = self.getQueryTemplate(collection_size, avg_doc_len).format(term_ids, len(ids)) self.cursor.execute(sql_query) output = self.cursor.fetchall() for rank, row in enumerate(output):