Merge pull request #5 from ldingemans/develop

Develop
ldingemans · Jun 8, 2023 · ca9f59d · ca9f59d
2 parents b84731f + 024d446
commit ca9f59d
Show file tree

Hide file tree

Showing 4 changed files with 10 additions and 5 deletions.
diff --git a/phenoscore/hpo_phenotype/calc_hpo_sim.py b/phenoscore/hpo_phenotype/calc_hpo_sim.py
@@ -7,6 +7,7 @@
 import copy
 import obonet
 
+
 class SimScorer:
     def __init__(self, scoring_method='Resnik', sum_method='BMA'):
         """
@@ -97,8 +98,8 @@ def _init_calc_similarity(self, scoring_method, sum_method):
         hpo_network, alt2prim, disease_records = generate_annotated_hpo_network(obo_file,
                                                                                 disease_to_phenotype_file, )
 
-        url = 'http://purl.obolibrary.org/obo/hp.obo'
-        full_hpo_graph = obonet.read_obo(url)
+        file_path = os.path.join(os.path.expanduser("~"), '.phenopy', 'data', 'hp.obo')
+        full_hpo_graph = obonet.read_obo(file_path)
 
         #the phenopy hpo_network does not included some terms like inheritance etc since they are not phenotypes
         #for name/id to name/id dict we need all

diff --git a/phenoscore/models/svm.py b/phenoscore/models/svm.py
@@ -181,7 +181,7 @@ def svm_class(X_train, y_train, X_test):
 
     if len(X_train) < 10:
         param_grid = {'C': [1e-3, 1, 1e3]}
-        clf = GridSearchCV(LogisticRegression(penalty='l1', max_iter=1000000, solver='liblinear'),
+        clf = GridSearchCV(LogisticRegression(max_iter=1000000, solver='liblinear'),
                            param_grid, cv=LeaveOneOut(), n_jobs=-1, scoring='neg_brier_score')
     else:
         param_grid = {'C': [1e-5, 1e-3, 1, 1e3, 1e5]}

diff --git a/phenoscore/phenoscorer.py b/phenoscore/phenoscorer.py
@@ -325,13 +325,17 @@ def predict_new_sample(self, original_X, original_y, img, hpo_all_new_sample, li
             get_clf(original_X, original_y, self._simscorer, self.mode, None)
 
         if self.mode != 'face':
+
             filtered_hpo = self._simscorer.filter_hpo_df(hpo_all_new_sample)
 
-            assert len(hpo_terms_pt) == len(hpo_terms_cont)
+            if len(hpo_terms_pt) != len(hpo_terms_cont):
+                print("WARNING: Number of HPO terms for patients and controls is not equal.")
 
             avg_pt, avg_cont = [], []
 
             for i in range(len(hpo_terms_pt)):
+                hpo_terms_pt[i], hpo_terms_cont[i] = self._simscorer.filter_hpo_df(
+                    hpo_terms_pt[i]), self._simscorer.filter_hpo_df(hpo_terms_cont[i])
                 avg_pt.append(self._simscorer.calc_similarity(filtered_hpo, hpo_terms_pt[i]))
                 avg_cont.append(self._simscorer.calc_similarity(filtered_hpo, hpo_terms_cont[i]))
 

diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
     long_description = fh.read()
 
 setup(name='phenoscore',
-      version='1.0.0',
+      version='1.0.1',
       packages=find_packages(),
       install_requires=requirements,