diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index bdfc3a44f..1c6e6d50e 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -342,7 +342,7 @@ def make_text_data(self): class ImportDocuments: META_DATA_FILE_KEY = "Text file" # this is what we will merge meta data on, change to user-set variable - CONLLU_META_DATA = "ID" + CONLLU_META_DATA = ["ID", "Text_ID"] def __init__( self, @@ -520,13 +520,17 @@ def _add_metadata(self, corpus: Corpus) -> Corpus: or self._meta_data is None or ( self.META_DATA_FILE_KEY not in self._meta_data.columns - and self.CONLLU_META_DATA not in self._meta_data.columns + and not any(i in self._meta_data.columns for i in + self.CONLLU_META_DATA) ) ): return corpus if self.is_conllu: - df = self._meta_data.set_index(self.CONLLU_META_DATA) + # find the first matching column + match_id = next((idx for idx in self.CONLLU_META_DATA if idx in + self._meta_data.columns)) + df = self._meta_data.set_index(match_id) path_column = corpus.get_column("utterance") else: df = self._meta_data.set_index( diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index 3032fe331..16f68c2b5 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -247,6 +247,8 @@ def test_conllu_reader(self): self.assertEqual(len(corpus), len(lemma)) self.assertEqual(len(corpus), len(pos)) self.assertEqual(len(corpus), len(ner)) + self.assertTrue(np.any(~np.isnan(corpus.get_column( + "Speaker_birth")))) @patch(SF_LIST, return_value=SPECIAL_CHAR_FILES) @patch(PATCH_METHOD, side_effect=ConnectTimeout("test message", request="")) diff --git a/orangecontrib/text/widgets/tests/data/conllu/ParlaMint01-meta.tsv b/orangecontrib/text/widgets/tests/data/conllu/ParlaMint01-meta.tsv new file mode 100755 index 000000000..21f82112b --- /dev/null +++ b/orangecontrib/text/widgets/tests/data/conllu/ParlaMint01-meta.tsv @@ -0,0 +1,3 @@ +ID Title From To House Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_type Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth +ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u1 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Regular Session 1, 1.8.2014 2014-08-01 2014-08-01 Lower house 7 1 Reference Chairperson MP DeSUS Demokratična stranka upokojencev Slovenije Kotnik Poropat, Marjana F 1944 +ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u2 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Regular Session 1, 1.8.2014 2014-08-01 2014-08-01 Lower house 7 1 Reference Regular MP SD Socialni demokrati Veber, Janko M 1960 diff --git a/orangecontrib/text/widgets/tests/data/conllu/ParlaMint02-meta.tsv b/orangecontrib/text/widgets/tests/data/conllu/ParlaMint02-meta.tsv new file mode 100755 index 000000000..982b2e0b1 --- /dev/null +++ b/orangecontrib/text/widgets/tests/data/conllu/ParlaMint02-meta.tsv @@ -0,0 +1,4 @@ +ID Title From To House Term Session Meeting Sitting Agenda Subcorpus Speaker_role Speaker_type Speaker_party Speaker_party_name Party_status Speaker_name Speaker_gender Speaker_birth +ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u1 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Chairperson MP SMC Stranka Mira Cerarja Brglez, Milan M 1967 +ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u2 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Chairperson MP SMC Stranka Mira Cerarja Brglez, Milan M 1967 +ParlaMint-SI_2014-08-25-SDZ7-Izredna-01.u3 Minutes of the National Assembly of the Republic of Slovenia, Term 7, Extraordinary Session 1, 25.8.2014 2014-08-25 2014-08-25 Lower house 7 1 Reference Regular MP SD Socialni demokrati Židan, Dejan M 1967