From 6cb7e3b29dd63ab62c71bb52e5a4a4d8044fe20c Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 17 Jun 2019 22:13:14 +0200 Subject: [PATCH 1/2] Really just checking verified commits, but some improvements too. --- README.md | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index fedbba4..8b687d0 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,31 @@ -# olddog +# OldDog The repository contains the code that accompanies the blog post [Teaching An Old Dog a new Trick](https://www.chriskamphuis.com/2019/03/06/teaching-an-old-dog-a-new-trick.html). -_olddog_ is build using maven: +## Preliminaries -` -mvn clean package appassembler:assemble -` +_OldDog_ is built using `maven`: -_olddog_ takes a Lucene index as input, this can be created by for example [Anserini](https://github.com/castorini/Anserini). -For example, the Robust 04 collection can be indexed as explained on [this](https://github.com/castorini/Anserini/blob/master/docs/experiments-robust04.md) page. + mvn clean package appassembler:assemble -After creating the index, the csv files representing the tables can be created issuing the following command: +_OldDog_ takes a Lucene index as input, for example as created by the [Anserini](https://github.com/castorini/Anserini) project. +The Robust 04 collection can be indexed as explained on [this Anserini page](https://github.com/castorini/Anserini/blob/master/docs/experiments-robust04.md). -` -nohup target/appassembler/bin/nl.ru.convert.Convert -index path/to/index -docs /tmp/docs.csv -dict /tmp/dict.csv -terms /tmp/terms.csv -` +## Setup -This creates multiple files that represent the columns of the `docs`, `dict` and `terms` -tables as described in the blog post. 
+After creating the index, the CSV files representing the database tables can be created by issuing the following command: + + nohup target/appassembler/bin/nl.ru.convert.Convert -index path/to/index -docs /tmp/docs.csv -dict /tmp/dict.csv -terms /tmp/terms.csv + +This creates multiple files that represent the columns of the `docs`, `dict` and `terms` tables as described in the blog post. The column store relational database [MonetDB](https://www.monetdb.org) can load -these files using the `copy into` [command](https://www.monetdb.org/Documentation/Cookbooks/SQLrecipes/CSV_bulk_loads). +these files using the `COPY INTO` [command](https://www.monetdb.org/Documentation/Cookbooks/SQLrecipes/CSV_bulk_loads). + +## Usage -After this it is possible issue the query described in the post: +After this final step it is possible to issue the query described in the post: ```sql WITH qterms AS (SELECT termid, docid, df FROM terms @@ -41,4 +42,8 @@ WITH qterms AS (SELECT termid, docid, df FROM terms SELECT scores.docid, score FROM (SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid) AS scores JOIN docs ON scores.docid=docs.docid ORDER BY score DESC; -``` \ No newline at end of file +``` + +## Team + +[**Chris Kamphuis**](https://github.com/chriskamphuis) and [**Arjen de Vries**](https://github.com/arjenpdevries). 
From 179a0c5fe169985d0d1f81f2cfeb2ed564952557 Mon Sep 17 00:00:00 2001 From: chriskamphuis Date: Wed, 19 Jun 2019 14:16:41 +0200 Subject: [PATCH 2/2] do not hardcode collection stats --- src/main/python/search_collection.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/main/python/search_collection.py b/src/main/python/search_collection.py index f56d684..76bb5c2 100755 --- a/src/main/python/search_collection.py +++ b/src/main/python/search_collection.py @@ -10,13 +10,13 @@ class SearchCollection: - def getQueryTemplate(self): + def getQueryTemplate(self, collection_size, avg_doc_len): queryTemplate = """ WITH qterms AS (SELECT termid, docid, count FROM terms WHERE termid IN ({})), subscores AS (SELECT docs.collection_id, docs.id, len, term_tf.termid, - term_tf.tf, df, (log((528155-df+0.5)/(df+0.5))*((term_tf.tf*(1.2+1)/ - (term_tf.tf+1.2*(1-0.75+0.75*(len/188.33)))))) AS subscore + term_tf.tf, df, (log(({}-df+0.5)/(df+0.5))*((term_tf.tf*(1.2+1)/ + (term_tf.tf+1.2*(1-0.75+0.75*(len/{})))))) AS subscore FROM (SELECT termid, docid, count as tf FROM qterms) AS term_tf JOIN (SELECT docid FROM qterms GROUP BY docid {}) @@ -30,13 +30,20 @@ def getQueryTemplate(self): conjunctive = 'HAVING COUNT(distinct termid) = {}' if self.args.disjunctive: conjunctive = '' - queryTemplate = queryTemplate.format('{}', conjunctive) + queryTemplate = queryTemplate.format('{}', collection_size, avg_doc_len, conjunctive) return queryTemplate def search(self): topics = self.topicReader.get_topics() ofile = open(self.args.output, 'w+') print("SCORING TOPICS") + + self.cursor.execute("SELECT COUNT(*) FROM docs;") + collection_size = self.cursor.fetchone()[0] + + self.cursor.execute("SELECT ROUND(AVG(len), 2) FROM docs;") + avg_doc_len = self.cursor.fetchone()[0] + for topic in topics: query_terms = topic['title'].split(" ") ids = [] @@ -47,9 +54,9 @@ def search(self): ids.append(str(term_id[0])) term_ids = ", ".join(ids) if self.args.disjunctive: - 
sql_query = self.getQueryTemplate().format(term_ids) + sql_query = self.getQueryTemplate(collection_size, avg_doc_len).format(term_ids) else: - sql_query = self.getQueryTemplate().format(term_ids, len(ids)) + sql_query = self.getQueryTemplate(collection_size, avg_doc_len).format(term_ids, len(ids)) self.cursor.execute(sql_query) output = self.cursor.fetchall() for rank, row in enumerate(output):