Commit

Merge branch 'master' of http://github.com/chriskamphuis/olddog
chriskamphuis committed Jun 19, 2019
2 parents e0599ca + 8d148f7 commit 7b569fd
Showing 2 changed files with 34 additions and 22 deletions.
37 changes: 21 additions & 16 deletions README.md
# OldDog

The repository contains the code that accompanies the blog post
[Teaching An Old Dog a new Trick](https://www.chriskamphuis.com/2019/03/06/teaching-an-old-dog-a-new-trick.html).

## Preliminaries

_OldDog_ is built using `maven`:

    mvn clean package appassembler:assemble

_OldDog_ takes a Lucene index as input, for example one created by the [Anserini](https://github.com/castorini/Anserini) project.
The Robust 04 collection can be indexed as explained on [this Anserini page](https://github.com/castorini/Anserini/blob/master/docs/experiments-robust04.md).
## Setup

After creating the index, the CSV files representing the database tables can be created by issuing the following command:

    nohup target/appassembler/bin/nl.ru.convert.Convert -index path/to/index -docs /tmp/docs.csv -dict /tmp/dict.csv -terms /tmp/terms.csv

This creates multiple files that represent the columns of the `docs`, `dict` and `terms` tables as described in the blog post.

The column store relational database [MonetDB](https://www.monetdb.org) can load
these files using the `COPY INTO` [command](https://www.monetdb.org/Documentation/Cookbooks/SQLrecipes/CSV_bulk_loads).
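As an illustration, the load step can be scripted. The sketch below only builds the `COPY INTO` statements for the three CSV files; the table names and `/tmp` paths mirror the Convert command above, while the delimiter option is an assumption (see the linked cookbook for the full syntax):

```python
def copy_into_statements(tables=('docs', 'dict', 'terms'), base='/tmp'):
    # One MonetDB COPY INTO statement per CSV produced by Convert.
    # Adjust the DELIMITERS options to match your actual CSV output.
    return ["COPY INTO {0} FROM '{1}/{0}.csv' USING DELIMITERS ',';".format(t, base)
            for t in tables]
```

Each statement can then be sent through any MonetDB client, for example `mclient` or `pymonetdb`.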

## Usage

After this final step it is possible to issue the query described in the post:

```sql
WITH qterms AS (SELECT termid, docid, df FROM terms
-- ...
SELECT scores.docid, score FROM (SELECT docid, sum(subscore) AS score
FROM subscores GROUP BY docid) AS scores JOIN docs ON
scores.docid=docs.docid ORDER BY score DESC;
```
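The subscore expression in this query is the classic BM25 weight with k1 = 1.2 and b = 0.75. A minimal Python sketch of the same per-term computation; the defaults 528155 and 188.33 are the Robust04 collection size and average document length that were hardcoded in earlier versions of `search_collection.py`:

```python
import math

def bm25_subscore(tf, df, doc_len=188.33,
                  N=528155, avg_doc_len=188.33, k1=1.2, b=0.75):
    """Per-term BM25 weight, mirroring the subscore expression in the SQL.

    Pass your own collection statistics (N, avg_doc_len) for indexes
    other than Robust04.
    """
    idf = math.log((N - df + 0.5) / (df + 0.5))
    tf_norm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
    return idf * tf_norm
```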

## Team

[**Chris Kamphuis**](https://github.com/chriskamphuis) and [**Arjen de Vries**](https://github.com/arjenpdevries).
19 changes: 13 additions & 6 deletions src/main/python/search_collection.py

class SearchCollection:

    def getQueryTemplate(self, collection_size, avg_doc_len):
        queryTemplate = """
        WITH qterms AS (SELECT termid, docid, count FROM terms
            WHERE termid IN ({})),
            subscores AS (SELECT docs.collection_id, docs.id, len, term_tf.termid,
                term_tf.tf, df, (log(({}-df+0.5)/(df+0.5))*((term_tf.tf*(1.2+1)/
                (term_tf.tf+1.2*(1-0.75+0.75*(len/{})))))) AS subscore
            FROM (SELECT termid, docid, count as tf FROM qterms) AS term_tf
            JOIN (SELECT docid FROM qterms
                GROUP BY docid {})
        ...
        conjunctive = 'HAVING COUNT(distinct termid) = {}'
        if self.args.disjunctive:
            conjunctive = ''
        queryTemplate = queryTemplate.format('{}', collection_size, avg_doc_len, conjunctive)
        return queryTemplate

    def search(self):
        topics = self.topicReader.get_topics()
        ofile = open(self.args.output, 'w+')
        print("SCORING TOPICS")

        self.cursor.execute("SELECT COUNT(*) FROM docs;")
        collection_size = self.cursor.fetchone()[0]

        self.cursor.execute("SELECT ROUND(AVG(len), 2) FROM docs;")
        avg_doc_len = self.cursor.fetchone()[0]

        for topic in topics:
            query_terms = topic['title'].split(" ")
            ids = []
            ...
                ids.append(str(term_id[0]))
            term_ids = ", ".join(ids)
            if self.args.disjunctive:
                sql_query = self.getQueryTemplate(collection_size, avg_doc_len).format(term_ids)
            else:
                sql_query = self.getQueryTemplate(collection_size, avg_doc_len).format(term_ids, len(ids))
            self.cursor.execute(sql_query)
            output = self.cursor.fetchall()
            for rank, row in enumerate(output):
                ...
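One subtlety in `getQueryTemplate` is the two-pass formatting: the first `.format()` call substitutes the collection statistics while re-inserting a literal `'{}'`, so the term ids (and, in the conjunctive case, the term count) can be filled in per topic by a second `.format()` call. A toy illustration; the template string here is invented for clarity, not the real query:

```python
# Invented miniature of the real template: one placeholder for the term ids,
# two for collection statistics, one for the optional HAVING clause.
template = "WHERE termid IN ({}) /* N={} avgdl={} */ {}"

# Pass 1: fill in statistics, keep a literal '{}' for the per-topic term ids.
first_pass = template.format('{}', 528155, 188.33,
                             'HAVING COUNT(distinct termid) = {}')

# Pass 2 (per topic): fill in the term ids and the term count.
sql = first_pass.format("1, 2, 3", 3)
```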
