This repository has been archived by the owner on Jan 29, 2024. It is now read-only.

[BBS-126] Make bandit tests pass #177

Merged
merged 19 commits into from Dec 3, 2020
Changes from all commits
1 change: 1 addition & 0 deletions .bandit
@@ -0,0 +1 @@
skips: [B322]
2 changes: 1 addition & 1 deletion src/bbsearch/database/cord_19.py
@@ -37,7 +37,7 @@ def mark_bad_sentences(engine, sentences_table_name):

logger.info("Getting all sentences")
with engine.begin() as connection:
query = f"SELECT sentence_id, text FROM {sentences_table_name}"
query = f"SELECT sentence_id, text FROM {sentences_table_name}" # nosec
df_sentences = pd.read_sql(query, connection)

logger.info("Computing text lengths")
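For context: bandit's B608 check ("hardcoded_sql_expressions") flags string-built SQL like the query above, and the trailing # nosec marker tells bandit that the line was reviewed and the finding is accepted. A minimal sketch, with an illustrative table name:

table_name = "sentences"  # illustrative; the real value comes from trusted code, not user input

# Without "# nosec", bandit reports B608 (possible SQL injection through
# string-based query construction) on this line; the marker suppresses
# that single finding.
query = f"SELECT sentence_id, text FROM {table_name}"  # nosec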
2 changes: 1 addition & 1 deletion src/bbsearch/database/mining_cache.py
@@ -285,7 +285,7 @@ def _delete_rows(self):
DELETE
FROM {self.target_table}
WHERE mining_model = :mining_model
"""
""" # nosec
self.engine.execute(
sqlalchemy.sql.text(query),
mining_model=model_schema["model_path"],
5 changes: 3 additions & 2 deletions src/bbsearch/mining/attribute.py
@@ -580,7 +580,7 @@ def get_core_nlp_analysis(self, text):
response = requests.post(
self.core_nlp_url + request_params, data=request_data
)
assert response.status_code == 200
response.raise_for_status()
response_json = json.loads(response.text)
except requests.exceptions.RequestException:
warnings.warn("There was a problem contacting the CoreNLP server.")
@@ -801,7 +801,8 @@ def __init__(self, texts, attribute_extractor, ee_model):
"""
super().__init__()

assert len(texts) > 0
if not texts:
raise TypeError("texts must be a non-empty list.")
self.texts = texts

self.idx_slider = widgets.IntSlider(
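Both hunks above replace assert statements with explicit exceptions: bandit's B101 check flags asserts, and asserts are silently skipped when Python runs with the -O flag. A minimal sketch of the two patterns (function names and URL handling are illustrative):

import requests

def get_core_nlp_analysis_sketch(url: str, payload: bytes) -> dict:
    """Illustrative sketch: raise_for_status() replaces the status-code assert."""
    response = requests.post(url, data=payload)
    # Raises requests.exceptions.HTTPError for 4xx/5xx replies, instead of
    # relying on `assert response.status_code == 200`.
    response.raise_for_status()
    return response.json()

def check_texts(texts: list) -> None:
    """Illustrative sketch: explicit validation replaces `assert len(texts) > 0`."""
    if not texts:
        raise TypeError("texts must be a non-empty list.")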
18 changes: 13 additions & 5 deletions src/bbsearch/mining/entity.py
@@ -1,5 +1,6 @@
"""Classes and functions for entity extraction (aka named entity recognition)."""

import ast
import copy

import numpy as np
@@ -198,6 +199,8 @@ def to_jsonl(self, path, sort_by=None):

Parameters
----------
path : pathlib.Path
File where to save it.
sort_by : None or list
If None, then no sorting taking place. If ``list``, then the
names of columns along which to sort.
@@ -326,11 +329,16 @@ def row2raw(row):
):
raise KeyError()

value = (
eval(f"{value_type}({value_str})")
if value_type != "str"
else value_str
)
if value_type != "str":
try:
value = ast.literal_eval(value_str)
except ValueError as ve:
if str(ve).startswith("malformed node or string"):
raise NameError(str(ve)) from ve
else:
raise
else:
value = value_str

token_pattern = {attribute: value}
if op:
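A brief illustration of the ast.literal_eval behaviour that the new error handling above relies on:

import ast

# literal_eval parses Python literals without executing arbitrary code,
# unlike the eval() call it replaces.
print(ast.literal_eval("[1, 2, 3]"))   # [1, 2, 3]
print(ast.literal_eval("{'a': 1}"))    # {'a': 1}

# A bare name such as `foo` is not a literal: literal_eval raises
# ValueError("malformed node or string ..."), which the code above maps
# back to NameError to keep the previous eval()-style behaviour.
try:
    ast.literal_eval("foo")
except ValueError as error:
    print(error)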
6 changes: 5 additions & 1 deletion src/bbsearch/mining/eval.py
@@ -441,7 +441,11 @@ def ner_errors(
...
}
"""
assert len(iob_true) == len(iob_pred) == len(tokens)
if not (len(iob_true) == len(iob_pred) == len(tokens)):
raise ValueError(
f"Inputs iob_true (len={len(iob_true)}), iob_pred (len={len(iob_pred)}), "
f"tokens (len={len(tokens)}) should have equal length."
)
etypes = unique_etypes(iob_true)

etypes_map = etypes_map if etypes_map is not None else dict()
10 changes: 5 additions & 5 deletions src/bbsearch/mining/relation.py
@@ -137,29 +137,29 @@ def annotate(doc, sent, ent_1, ent_2, etype_symbols):
tokens = []
i = sent.start
while i < sent.end:
new_token = " " # hack to keep the punctuation nice
new_tkn = " " # hack to keep the punctuation nice

if ent_1.start == i:
start, end = ent_1.start, ent_1.end
new_token += (
new_tkn += (
etype_symbols[etype_1][0]
+ doc[start:end].text
+ etype_symbols[etype_1][1]
)

elif ent_2.start == i:
start, end = ent_2.start, ent_2.end
new_token += (
new_tkn += (
etype_symbols[etype_2][0]
+ doc[start:end].text
+ etype_symbols[etype_2][1]
)

else:
start, end = i, i + 1
new_token = doc[i].text if doc[i].is_punct else new_token + doc[i].text
new_tkn = doc[i].text if doc[i].is_punct else new_tkn + doc[i].text

tokens.append(new_token)
tokens.append(new_tkn)
i += end - start

return "".join(tokens).strip()
119 changes: 77 additions & 42 deletions src/bbsearch/sql.py
@@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
import sqlalchemy.sql as sql


def get_titles(article_ids, engine):
@@ -23,13 +24,16 @@ def get_titles(article_ids, engine):
if len(article_ids) == 0:
return {}

query = f"""\
SELECT article_id, title
FROM articles
WHERE article_id IN ({",".join(map(str, article_ids))})
"""
query = sql.text(
"""SELECT article_id, title
FROM articles
WHERE article_id IN :article_ids
"""
)
query = query.bindparams(sql.bindparam("article_ids", expanding=True))

with engine.begin() as connection:
response = connection.execute(query).fetchall()
response = connection.execute(query, {"article_ids": article_ids}).fetchall()
titles = {article_id: title for article_id, title in response}

return titles
@@ -72,16 +76,21 @@ def retrieve_sentences_from_sentence_ids(sentence_ids, engine, keep_order=False)
Pandas DataFrame containing all sentences and their corresponding metadata:
article_id, sentence_id, section_name, text, paragraph_pos_in_article.
"""
sentence_ids_s = ", ".join(str(id_) for id_ in sentence_ids)
sentence_ids_s = sentence_ids_s or "NULL"
sql_query = f"""
SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article
FROM sentences
WHERE sentence_id IN ({sentence_ids_s})
"""
sql_query = sql.text(
"""
SELECT article_id, sentence_id, section_name, text, paragraph_pos_in_article
FROM sentences
WHERE sentence_id IN :sentence_ids
"""
)
sql_query = sql_query.bindparams(sql.bindparam("sentence_ids", expanding=True))

with engine.begin() as connection:
df_sentences = pd.read_sql(sql_query, connection)
df_sentences = pd.read_sql(
sql_query,
params={"sentence_ids": [int(id_) for id_ in sentence_ids]},
con=connection,
)

if keep_order:
# Remove sentence IDs that were not found, otherwise df.loc will fail.
@@ -112,19 +121,23 @@ def retrieve_paragraph_from_sentence_id(sentence_id, engine):
sentence_id. If None then the `sentence_id` was not found in the
sentences table.
"""
sql_query = f"""SELECT text
sql_query = sql.text(
"""SELECT text
FROM sentences
WHERE article_id =
(SELECT article_id
FROM sentences
WHERE sentence_id = {sentence_id})
WHERE sentence_id = :sentence_id )
AND paragraph_pos_in_article =
(SELECT paragraph_pos_in_article
FROM sentences
WHERE sentence_id = {sentence_id})
WHERE sentence_id = :sentence_id )
ORDER BY sentence_pos_in_paragraph ASC"""
)

all_sentences = pd.read_sql(sql_query, engine)["text"].to_list()
all_sentences = pd.read_sql(
sql_query, engine, params={"sentence_id": int(sentence_id)}
Contributor:
According to the docstring, sentence_id is already an int. Why is a casting to int added?

Contributor:
Same answer as in this comment.

)["text"].to_list()
if not all_sentences:
paragraph = None
else:
@@ -151,13 +164,22 @@ def retrieve_paragraph(article_id, paragraph_pos_in_article, engine):
pd.DataFrame with the paragraph and its metadata:
article_id, text, section_name, paragraph_pos_in_article.
"""
sql_query = f"""SELECT section_name, text
sql_query = sql.text(
"""SELECT section_name, text
FROM sentences
WHERE article_id = {article_id}
AND paragraph_pos_in_article = {paragraph_pos_in_article}
WHERE article_id = :article_id
AND paragraph_pos_in_article = :paragraph_pos_in_article
ORDER BY sentence_pos_in_paragraph ASC"""

sentences = pd.read_sql(sql_query, engine)
)

sentences = pd.read_sql(
sql_query,
engine,
params={
"article_id": int(article_id),
"paragraph_pos_in_article": int(paragraph_pos_in_article),
Comment on lines +179 to +180

Contributor:
According to the docstring, article_id and paragraph_pos_in_article are already of type int. Why is a casting to int added?

Contributor Author:
As Python doesn't have static typing, if someone passes any type other than int we could have a mess.
In particular, if we pass a np.int64 the SQL query breaks, which was indeed what was happening in our case.

Contributor:
I see. Was it spotted by bandit?

Anyway, should we consider... type annotations for all BBS then? ;) Or data validation frameworks for Python arguments?

Contributor Author:
For type annotations: I am not against, as said in the past ;) I am still a bit annoyed that one would have to write them for each function by hand; it would be cool if we could use the numpy docstrings we already have to auto-generate type annotations (if PyCharm can do it, there must be a way, I hope).

But afaik type annotations wouldn't have raised any exception here, right? They are just useful to help your IDE or a developer know which type is "expected".

For data validation frameworks: I do not know any, do you have one in mind in particular?
I feel Python's duck typing is a key feature, so unless it's really needed I would prefer avoiding type checking...

Contributor (pafonta, Dec 2, 2020):
@FrancescoCasalegno

> auto-generate type annotations

According to what I have found, there is tooling to convert docstring types into type annotations.

> type annotations wouldn't have raised any exception here right?

The IDE would have complained where the function with the body we discuss is used.
A static type checker like mypy, run by the CI, would have complained.
So, with this, the case where an exception would have been thrown would not be reached.

> For data validation frameworks: I do not know any, do you have one in mind in particular?

For validating inputs, I would have pydantic in mind.

> I feel Python's duck typing is a key feature, so unless it's really needed I would prefer avoiding type checking...

I agree. However, I would weigh it against two other points:

  • How well are exceptions managed?
  • How critical is it if the process stops?

This being said, I think that checking docstring types or Python type annotations would already let us be more comfortable with the runtime issues. I won't go for input validation.

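A minimal sketch of the failure mode discussed above and of the fix, assuming an in-memory SQLite engine (the exact exception depends on the DB driver):

import numpy as np
import pandas as pd
import sqlalchemy
import sqlalchemy.sql as sql

engine = sqlalchemy.create_engine("sqlite://")  # illustrative in-memory database
with engine.begin() as connection:
    connection.execute(sql.text("CREATE TABLE sentences (sentence_id INTEGER, text TEXT)"))
    connection.execute(sql.text("INSERT INTO sentences VALUES (1, 'A sentence.')"))

query = sql.text("SELECT text FROM sentences WHERE sentence_id = :sentence_id")
sentence_id = np.int64(1)  # e.g. coming out of a pandas DataFrame column

# Binding the np.int64 directly may fail (sqlite3 rejects it as an unsupported
# parameter type), hence the explicit int() cast added in this diff.
df = pd.read_sql(query, engine, params={"sentence_id": int(sentence_id)})
print(df)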
},
)
if sentences.empty:
paragraph = pd.DataFrame(
columns=["article_id", "text", "section_name", "paragraph_pos_in_article"]
@@ -199,10 +221,12 @@ def retrieve_article_metadata_from_article_id(article_id, engine):
'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id',
'pdf_json_files', 'pmc_json_files', 'url', 's2_id'.
"""
sql_query = f"""SELECT *
sql_query = sql.text(
"""SELECT *
FROM articles
WHERE article_id = {article_id}"""
article = pd.read_sql(sql_query, engine)
WHERE article_id = :article_id"""
)
article = pd.read_sql(sql_query, engine, params={"article_id": int(article_id)})
return article


@@ -222,14 +246,17 @@ def retrieve_articles(article_ids, engine):
DataFrame containing the articles divided into paragraphs. The columns are
'article_id', 'paragraph_pos_in_article', 'text', 'section_name'.
"""
articles_str = ", ".join(str(id_) for id_ in article_ids)
sql_query = f"""SELECT *
article_ids = [int(id_) for id_ in article_ids]
sql_query = sql.text(
"""SELECT *
FROM sentences
WHERE article_id IN ({articles_str})
WHERE article_id IN :articles_ids
ORDER BY article_id ASC,
paragraph_pos_in_article ASC,
sentence_pos_in_paragraph ASC"""
all_sentences = pd.read_sql(sql_query, engine)
)
sql_query = sql_query.bindparams(sql.bindparam("articles_ids", expanding=True))
all_sentences = pd.read_sql(sql_query, engine, params={"articles_ids": article_ids})

groupby_var = all_sentences.groupby(by=["article_id", "paragraph_pos_in_article"])
paragraphs = groupby_var["text"].apply(lambda x: " ".join(x))
@@ -260,21 +287,27 @@ def retrieve_mining_cache(identifiers, model_names, engine):
result : pd.DataFrame
Selected rows of the `mining_cache` table.
"""
model_names = tuple(set(model_names))
if len(model_names) == 1:
model_names = f"('{model_names[0]}')"
model_names = list(set(model_names))
identifiers_arts = [int(a) for a, p in identifiers if p == -1]

identifiers_arts = tuple(a for a, p in identifiers if p == -1)
if len(identifiers_arts) == 1:
identifiers_arts = f"({identifiers_arts[0]})"
if identifiers_arts:
query_arts = f"""
query_arts = sql.text(
"""
SELECT *
FROM mining_cache
WHERE article_id IN {identifiers_arts} AND mining_model IN {model_names}
WHERE article_id IN :identifiers_arts AND mining_model IN :model_names
ORDER BY article_id, paragraph_pos_in_article, start_char
"""
df_arts = pd.read_sql(query_arts, con=engine)
)
query_arts = query_arts.bindparams(
sql.bindparam("identifiers_arts", expanding=True),
sql.bindparam("model_names", expanding=True),
)
df_arts = pd.read_sql(
query_arts,
con=engine,
params={"identifiers_arts": identifiers_arts, "model_names": model_names},
)
else:
df_arts = pd.DataFrame()

@@ -287,6 +320,8 @@ def retrieve_mining_cache(identifiers, model_names, engine):
# 3. If `len(identifiers_pars)` is too large, we may have a too long
# SQL statement which overflows the max length. So we break it down.

if len(model_names) == 1:
model_names = f"('{model_names[0]}')"
batch_size = 1000
dfs_pars = []
d, r = divmod(len(identifiers_pars), batch_size)
SELECT *
FROM mining_cache
WHERE (article_id = {a} AND paragraph_pos_in_article = {p})
"""
""" # nosec
for a, p in identifiers_pars[i * batch_size : (i + 1) * batch_size]
)
query_pars = f"""
SELECT *
FROM ({query_pars}) tt
WHERE tt.mining_model IN {model_names}
"""
""" # nosec
dfs_pars.append(pd.read_sql(query_pars, engine))
df_pars = pd.concat(dfs_pars)
df_pars = df_pars.sort_values(
@@ -553,7 +588,7 @@ def _build_query(self):
FROM articles
WHERE {" AND ".join(article_conditions)}
)
""".strip()
""".strip() # nosec
sentence_conditions.append(article_condition_query)

# Restricted sentence IDs
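For reference, a minimal sketch of the expanding bind-parameter pattern used throughout the sql.py changes above (table and data are illustrative):

import pandas as pd
import sqlalchemy
import sqlalchemy.sql as sql

engine = sqlalchemy.create_engine("sqlite://")  # illustrative in-memory database
with engine.begin() as connection:
    connection.execute(sql.text("CREATE TABLE articles (article_id INTEGER, title TEXT)"))
    connection.execute(sql.text("INSERT INTO articles VALUES (1, 'first'), (2, 'second')"))

# An expanding bindparam renders "IN (?, ?, ...)" with one placeholder per
# element at execution time, so the list is passed as data rather than
# interpolated into the SQL string (the pattern bandit's B608 check warns about).
query = sql.text("SELECT article_id, title FROM articles WHERE article_id IN :article_ids")
query = query.bindparams(sql.bindparam("article_ids", expanding=True))

df = pd.read_sql(query, engine, params={"article_ids": [1, 2]})
print(df)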
2 changes: 2 additions & 0 deletions tests/test_mining/test_eval.py
@@ -560,6 +560,8 @@ def test_ner_errors(ner_annotations, dataset, mode, errors_expected):
)
errors_expected = OrderedDict(errors_expected)
assert errors_out == errors_expected
with pytest.raises(ValueError):
ner_errors(iob_true, iob_pred[:-1], tokens)


def test_remove_punctuation(punctuation_annotations):
4 changes: 2 additions & 2 deletions tests/test_sql.py
@@ -74,7 +74,7 @@ def test_retrieve_sentence_from_sentence_ids(
]
)

@pytest.mark.parametrize("sentence_id", [1, 2, 3, 0, -100])
@pytest.mark.parametrize("sentence_id", [1, 2, 3, 0, -100, -1, np.int64(2)])
def test_retrieve_paragraph_from_sentence_id(
self, sentence_id, fake_sqlalchemy_engine
):
sentence_text = retrieve_sentences_from_sentence_ids(
sentence_ids=(sentence_id,), engine=fake_sqlalchemy_engine
)
if sentence_id == 0 or sentence_id == -100: # invalid sentence_id
if sentence_id in [0, -100, -1]: # invalid sentence_id
assert paragraph is None
else:
assert isinstance(paragraph, str)
4 changes: 2 additions & 2 deletions tox.ini
@@ -22,7 +22,7 @@ commands =
[testenv:lint]
skip_install = true
deps =
; bandit
bandit
black==20.8b1
flake8==3.8.4
isort==5.6.4
isort --profile black --check setup.py {[tox]source} tests
pydocstyle {[tox]source}
black -q --check setup.py {[tox]source} tests
; bandit -q -r {[tox]source}
bandit -c .bandit -q -r {[tox]source}

[testenv:format]
skip_install = true