pyJASS Integration with pyserini #1123

Open · wants to merge 39 commits into base: master

Commits (39)
0cc3d94 Set up basis for JASS searcher (Feb 11, 2022)
fb4866b implemented init method to load index from pyjass (prasys, Feb 11, 2022)
04868b1 initial commit for search (prasys, Feb 12, 2022)
5d97414 implemented crude version of search (prasys, Feb 14, 2022)
79a7184 initial work on main class (prasys, Feb 14, 2022)
2d5c1dc made search more elegant (prasys, Feb 14, 2022)
0b1aa4d implemented convert_to_search (prasys, Feb 14, 2022)
1bde926 fix return error (prasys, Feb 14, 2022)
cfed6c6 fix of handling query_id mapping (prasys, Feb 14, 2022)
047c9d0 if we see digit and a semicolon - consume it (prasys, Feb 14, 2022)
1456366 initial start for batch_search (prasys, Feb 15, 2022)
b411e81 implemented batch_search for pyjass (prasys, Feb 15, 2022)
fecc703 cleaned up implementation (prasys, Feb 15, 2022)
cab9da5 updated to support pyjass 0.2a7 (prasys, Feb 16, 2022)
cd887c6 added support for jass_search (prasys, Feb 16, 2022)
249ebe6 getting main driver class to work (prasys, Feb 16, 2022)
1e1ec1d some final fixes (prasys, Feb 16, 2022)
c882ebd Minor clean up (Feb 16, 2022)
ca2d306 implemented from_prebuilt_index from lucene (prasys, Feb 16, 2022)
52f077a exposed ascii/query parser to main_driver (prasys, Feb 16, 2022)
67cad07 First step for prepbuilt (Feb 16, 2022)
2cffa6f Make default query parser the default, ascii as an option. Fix prebuilt. (Feb 17, 2022)
a4593cc add to export time functionality (prasys, Feb 17, 2022)
b739d0c include initial work on time taken (prasys, Feb 17, 2022)
6a052ac Prefix 'jass' to prebuilt index strings (Feb 17, 2022)
6fe29a4 Revert some parameters to align with the Lucene API (Feb 17, 2022)
abc3fd9 More fixed for pre-built. Not quite ready yet... (Feb 17, 2022)
695806f first run of test_cases (prasys, Feb 17, 2022)
64e20a6 updated searcher (prasys, Feb 17, 2022)
0327ccc make it go fasterrrrrr!!!! (prasys, Feb 17, 2022)
a06a640 fix the bug (prasys, Feb 17, 2022)
3b340a3 using list comprehension (prasys, Feb 17, 2022)
4f59b56 Merge branch 'castorini:master' into jass-search (JMMackenzie, Feb 18, 2022)
cf2453b Add new pre-built index hashes (Feb 18, 2022)
0b780cb updated unit test for pyjass (prasys, Feb 18, 2022)
ca206d4 update pip requirements (prasys, Feb 18, 2022)
8c64a01 changed it to 88 cause its lucky (prasys, Feb 20, 2022)
64c6fe2 Merge branch 'castorini:master' into jass-search (prasys, Feb 20, 2022)
2aadb8e Merge branch 'jass-search' into pr/1 (prasys, Apr 20, 2022)
82 changes: 82 additions & 0 deletions — pyserini/prebuilt_index_info.py

@@ -1771,3 +1771,85 @@
"texts": "wikipedia-dpr"
}
}

JASS_INDEX_INFO = {
"jass-msmarco-passage-bm25": {
"description": "BP reordered JASS impact index of the MS MARCO passage corpus with BM25 scoring",
"filename": "jass-index.msmarco-passage.bm25.20220217.5cbb40.tar.gz",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/jass-index.msmarco-passage.bm25.20220217.5cbb40.tar.gz"
],
"md5": "9add4b1f754c5f33d31501c65e5e92d3",
"size compressed (bytes)": 629101230,
"total_terms": 0,
"documents": 0,
"unique_terms": 0,
"downloaded": False
},
"jass-msmarco-passage-d2q-t5": {
"description": "BP reordered JASS impact index of the MS MARCO passage corpus with BM25 scoring over a DocT5Query expanded collection",
"filename": "jass-index.msmarco-passage.d2q-t5.20220217.5cbb40.tar.gz",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/jass-index.msmarco-passage.d2q-t5.20220217.5cbb40.tar.gz"
],
"md5": "9be8d8890d60410243a8c7323849ecc9",
"size compressed (bytes)": 832303111,
"total_terms": 0,
"documents": 0,
"unique_terms": 0,
"downloaded": False
},
"jass-msmarco-passage-deepimpact": {
"description": "BP reordered JASS impact index of the MS MARCO passage corpus with DeepImpact scoring",
"filename": "jass-index.msmarco-passage.deepimpact.20220217.5cbb40.tar.gz",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/jass-index.msmarco-passage.deepimpact.20220217.5cbb40.tar.gz"
],
"md5": "d9ed05d97e1f07373d7a98a1dd9f6fac",
"size compressed (bytes)": 1217477634,
"total_terms": 0,
"documents": 0,
"unique_terms": 0,
"downloaded": False
},
"jass-msmarco-passage-unicoil-d2q": {
"description": "BP reordered JASS impact index of the MS MARCO passage corpus with uniCOIL scoring over a DocT5Query expanded collection",
"filename": "jass-index.msmarco-passage.unicoil-d2q.20220217.5cbb40.tar.gz",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/jass-index.msmarco-passage.unicoil-d2q.20220217.5cbb40.tar.gz"
],
"md5": "24bab2ef23914ab124d4f0eba8dc866c",
"size compressed (bytes)": 1084195359,
"total_terms": 0,
"documents": 0,
"unique_terms": 0,
"downloaded": False
},
"jass-msmarco-unicoil-tilde": {
"description": "BP reordered JASS impact index of the MS MARCO passage corpus with uniCOIL scoring over a TILDE expanded collection",
"filename": "jass-index.msmarco-passage.unicoil-tilde.20220217.5cbb40.tar.gz",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/jass-index.msmarco-passage.unicoil-tilde.20220217.5cbb40.tar.gz"
],
"md5": "705c3e72cff189265de9b5c509be00a6",
"size compressed (bytes)": 1724440877,
"total_terms": 0,
"documents": 0,
"unique_terms": 0,
"downloaded": False
},
"jass-msmarco-passage-distill-splade-max": {
"description": "BP reordered JASS impact index of the MS MARCO passage corpus with distill-splade-max scoring",
"filename": "jass-index.msmarco-passage.distill-splade-max.20220217.5cbb40.tar.gz",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/jass-index.msmarco-passage.distill-splade-max.20220217.5cbb40.tar.gz"
],
"md5": "f6bf3cdf983d4e1aaee8677acbcdb47f",
"size compressed (bytes)": 3530600632,
"total_terms": 0,
"documents": 0,
"unique_terms": 0,
"downloaded": False
}
}
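The catalog above maps each prebuilt-index name to its archive filename, download URLs, and an md5 digest for integrity checking. A minimal sketch of how such an entry might be resolved and verified — the `resolve_prebuilt` and `md5_matches` helpers are hypothetical illustrations, not Pyserini API, and only one catalog entry is reproduced:

```python
import hashlib

# One entry copied from the JASS_INDEX_INFO dict in this diff.
JASS_INDEX_INFO = {
    "jass-msmarco-passage-bm25": {
        "filename": "jass-index.msmarco-passage.bm25.20220217.5cbb40.tar.gz",
        "urls": [
            "https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/"
            "jass-index.msmarco-passage.bm25.20220217.5cbb40.tar.gz"
        ],
        "md5": "9add4b1f754c5f33d31501c65e5e92d3",
    },
}


def resolve_prebuilt(name):
    """Return (download URL, expected md5) for a prebuilt JASS index name."""
    info = JASS_INDEX_INFO[name]
    return info["urls"][0], info["md5"]


def md5_matches(data, expected):
    """Check downloaded archive bytes against the catalog's md5 digest."""
    return hashlib.md5(data).hexdigest() == expected


url, digest = resolve_prebuilt("jass-msmarco-passage-bm25")
print(url.endswith(".tar.gz"), len(digest))  # → True 32
```

In the actual PR, `JASSv2Searcher.from_prebuilt_index` presumably performs the equivalent lookup-download-verify cycle before opening the extracted index.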

7 changes: 3 additions & 4 deletions — pyserini/search/__init__.py

@@ -19,13 +19,12 @@
from .lucene import JLuceneSearcherResult, LuceneSimilarities, LuceneFusionSearcher, LuceneSearcher
from .lucene import JImpactSearcherResult, LuceneImpactSearcher
from ._deprecated import SimpleSearcher, ImpactSearcher, SimpleFusionSearcher

from .faiss import DenseSearchResult, PRFDenseSearchResult, FaissSearcher, BinaryDenseSearcher, QueryEncoder, \
DprQueryEncoder, BprQueryEncoder, DkrrDprQueryEncoder, TctColBertQueryEncoder, AnceQueryEncoder, AutoQueryEncoder
from .jass import JASSv2Searcher
from .faiss import AnceEncoder
from .faiss import DenseVectorAveragePrf, DenseVectorRocchioPrf, DenseVectorAncePrf


__all__ = ['JQuery',
'LuceneSimilarities',
'LuceneFusionSearcher',
@@ -51,10 +50,10 @@
'BprQueryEncoder',
'DkrrDprQueryEncoder',
'TctColBertQueryEncoder',
'JASSv2Searcher',
'AnceEncoder',
'AnceQueryEncoder',
'AutoQueryEncoder',
'DenseVectorAveragePrf',
'DenseVectorRocchioPrf',
'DenseVectorAncePrf']
20 changes: 20 additions & 0 deletions — pyserini/search/jass/__init__.py

@@ -0,0 +1,20 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from ._searcher import JASSv2Searcher, JASSv2SearcherResult


__all__ = ['JASSv2Searcher', 'JASSv2SearcherResult']
111 changes: 111 additions & 0 deletions — pyserini/search/jass/__main__.py

@@ -0,0 +1,111 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import os

from tqdm import tqdm

from pyserini.output_writer import OutputFormat, get_output_writer
from pyserini.query_iterator import get_query_iterator, TopicsFormat
from pyserini.search import JASSv2Searcher


def define_search_args(parser):
parser.add_argument('--index', type=str, metavar='path to index or index name', required=True,
help="Path to pyJass index")
parser.add_argument('--rho', type=int, default=1000000000, help='rho: how many postings to process')
    parser.add_argument('--basic-parser', default=False, action='store_true',
                        help="Use the basic query parser; the default is the ASCII parser.")

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Search a pyJass index.')
define_search_args(parser)
parser.add_argument('--topics', type=str, metavar='topic_name', required=True,
help="Name of topics. Available: robust04, robust05, core17, core18.")
parser.add_argument('--hits', type=int, metavar='num',
required=False, default=1000, help="Number of hits.")
parser.add_argument('--topics-format', type=str, metavar='format', default=TopicsFormat.DEFAULT.value,
help=f"Format of topics. Available: {[x.value for x in list(TopicsFormat)]}")
parser.add_argument('--output-format', type=str, metavar='format', default=OutputFormat.TREC.value,
help=f"Format of output. Available: {[x.value for x in list(OutputFormat)]}")
parser.add_argument('--output', type=str, metavar='path',
help="Path to output file.")
parser.add_argument('--batch-size', type=int, metavar='num', required=False,
default=1, help="Specify batch size to search the collection concurrently.")
parser.add_argument('--threads', type=int, metavar='num', required=False,
default=1, help="Maximum number of threads to use.")
parser.add_argument('--impact', action='store_true', help="Use Impact.")

args = parser.parse_args()

query_iterator = get_query_iterator(args.topics, TopicsFormat(args.topics_format))
topics = query_iterator.topics

if os.path.exists(args.index):
searcher = JASSv2Searcher(args.index, 2)
else:
searcher = JASSv2Searcher.from_prebuilt_index(args.index)

if not searcher:
exit()

# JASS does not (yet) support field-based retrieval
fields = None

if not args.impact:
print("Enforcing --impact; JASS requires impact-based retrieval.")

# JASS Parser Option
if args.basic_parser:
searcher.set_basic_parser()

# build output path
output_path = args.output
if output_path is None:
tokens = ['run', args.topics, '_'.join(['rho',str(args.rho)]), 'txt'] # we use the rho output
output_path = '.'.join(tokens)

print(f'Running {args.topics} topics, saving to {output_path}...')
tag = output_path[:-4] if args.output is None else 'JaSS'

output_writer = get_output_writer(output_path, OutputFormat(args.output_format), 'w',
max_hits=args.hits, tag=tag, topics=topics)

with output_writer:
batch_topics = list()
batch_topic_ids = list()
for index, (topic_id, text) in enumerate(tqdm(query_iterator, total=len(topics.keys()))):
if args.batch_size <= 1 and args.threads <= 1:
hits = searcher.search(text, args.hits, args.rho)
results = [(topic_id, hits)]
else:
batch_topic_ids.append(str(topic_id))
batch_topics.append(text)
if (index + 1) % args.batch_size == 0 or \
index == len(topics.keys()) - 1:
results = searcher.batch_search(
batch_topics, batch_topic_ids, args.hits, args.rho, args.threads)
results = [(id_, results[id_]) for id_ in batch_topic_ids]
batch_topic_ids.clear()
batch_topics.clear()
else:
continue

for topic, hits in results:
# write results
output_writer.write(topic, hits)

results.clear()
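The batch loop above accumulates queries and flushes them to `batch_search` whenever the batch fills or the last topic is reached. That flush condition can be isolated in a small, self-contained sketch — `flush_in_batches` is a hypothetical helper written for illustration, not part of the PR:

```python
def flush_in_batches(items, batch_size):
    """Yield items in lists of up to batch_size, mirroring the
    (index + 1) % batch_size == 0 or index == len(items) - 1
    flush condition used in __main__.py above."""
    batch = []
    for index, item in enumerate(items):
        batch.append(item)
        # Flush on a full batch, or on the final item so no query is dropped.
        if (index + 1) % batch_size == 0 or index == len(items) - 1:
            yield list(batch)
            batch.clear()


print(list(flush_in_batches(["q1", "q2", "q3", "q4", "q5"], 2)))
# → [['q1', 'q2'], ['q3', 'q4'], ['q5']]
```

The trailing `index == len(items) - 1` check is what makes the final, possibly partial batch get searched; without it, up to `batch_size - 1` queries at the end of the topic file would silently go unsearched.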