diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..df3e58e --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/tests/clog.txt b/tests/clog.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/log.txt b/tests/log.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/resources/__init__.py b/tests/resources/__init__.py new file mode 100644 index 0000000..92d2563 --- /dev/null +++ b/tests/resources/__init__.py @@ -0,0 +1,3 @@ +import os + +TEST_PATH = os.path.dirname(__file__) diff --git a/tests/resources/additives.json b/tests/resources/additives.json new file mode 100644 index 0000000..911261b --- /dev/null +++ b/tests/resources/additives.json @@ -0,0 +1,544 @@ +[ + { + "material": "LiSr1-xPO4:Eux", + "parser_output": [ + { + "material_string": "LiSr1-xPO4:Eux", + "material_name": "", + "material_formula": "EuxLiSr1-xPO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "EuxLiSr1-xPO4", + "amount": "1", + "elements": { + "Eu": "x", + "Li": "1", + "Sr": "1-x", + "P": "1", + "O": "4" + }, + "species": { + "Eu": "x", + "Li": "1", + "Sr": "1-x", + "PO4": "1" + } + } + ] + } + ] + }, + { + "material": "NaSrBO3:Tb3+", + "parser_output": [ + { + "material_string": "NaSrBO3:Tb3+", + "material_name": "", + "material_formula": "NaSrBO3", + "additives": [ + "Tb3+" + ], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "NaSrBO3", + "amount": "1", + "elements": { + "Na": "1", + "Sr": "1", + "B": "1", + "O": "3" + }, + "species": { + "Na": "1", + "Sr": "1", + "BO3": "1" + } + } + ] + } + ] + }, + { + "material": "SrAl2O4:Eu0.01", + "parser_output": [ + { + "material_string": "SrAl2O4:Eu0.01", + "material_name": "", + "material_formula": "SrAl2O4", + "additives": [ + "Eu0.01" + ], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "SrAl2O4", + "amount": "1", + "elements": { + "Sr": "1", + "Al": "2", + "O": "4" + }, + "species": { + "Sr": "1", + "Al2O4": "1" + } + } + ] + } + ] + }, + { + "material": "Sr1.97MgSi2O7: Eu0.012+, Dy0.013+, Ho0.013+", + "parser_output": [ + { + "material_string": "Sr1.97MgSi2O7: Eu0.012+, Dy0.013+, Ho0.013+", + "material_name": "", + "material_formula": "Sr1.97MgSi2O7", + "additives": [ + "Eu0.012+", + "Dy0.013+", + "Ho0.013+" + ], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Sr1.97MgSi2O7", + "amount": "1", + "elements": { + "Sr": "1.97", + "Mg": "1", + "Si": "2", + "O": "7" + }, + "species": { + "Sr": "1.97", + "Mg": "1", + "Si2O7": "1" + } + } + ] + } + ] + }, + { + "material": "Ca2BO3Cl:Sm3+, Eu3+", + "parser_output": [ + { + "material_string": "Ca2BO3Cl:Sm3+, Eu3+", + "material_name": "", + "material_formula": "Ca2BO3Cl", + "additives": [ + "Sm3+", + "Eu3+" + ], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Ca2BO3Cl", + "amount": "1", + "elements": { + "Ca": "2", + "B": "1", + "O": "3", + "Cl": "1" + }, + "species": { + "Ca": "2", + "BO3": "1", + "Cl": "1" + } + } + ] + } + ] + }, + { + "material": "(K0.16Na0.84)0.5Bi4.5Ti4O15+xwt.% CeO2", + "parser_output": [ + { + "material_string": "(K0.16Na0.84)0.5Bi4.5Ti4O15+xwt.% CeO2", + "material_name": "", + "material_formula": "(1-x)(K0.16Na0.84)0.5Bi4.5Ti4O15-xCeO2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(K0.16Na0.84)0.5Bi4.5Ti4O15", + "amount": "1-x", + "elements": { + "K": "0.08", + "Na": "0.42", + "Bi": "4.5", + "Ti": "4", + "O": "15" + }, + "species": { + "K": "0.08", + "Na": "0.42", + "Bi": "4.5", + "Ti": "4", + "O": "15" + } + }, + { + "formula": "CeO2", + "amount": "x", + "elements": { + "Ce": "1", + "O": "2" + }, + "species": { + "Ce": "1", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "Ti0.4V0.3Mn0.15Cr0.15+x wt% LaNi3.55Co0.75Mn0.4Al0.3", + "parser_output": [ + { + "material_string": "Ti0.4V0.3Mn0.15Cr0.15+x wt% LaNi3.55Co0.75Mn0.4Al0.3", + "material_name": "", + "material_formula": "(1-x)Ti0.4V0.3Mn0.15Cr0.15-xLaNi3.55Co0.75Mn0.4Al0.3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Ti0.4V0.3Mn0.15Cr0.15", + "amount": "1-x", + "elements": { + "Ti": "0.4", + "V": "0.3", + "Mn": "0.15", + "Cr": "0.15" + }, + "species": { + "Ti": "0.4", + "V": "0.3", + "Mn": "0.15", + "Cr": "0.15" + } + }, + { + "formula": "LaNi3.55Co0.75Mn0.4Al0.3", + "amount": "x", + "elements": { + "La": "1", + "Ni": "3.55", + "Co": "0.75", + "Mn": "0.4", + "Al": "0.3" + }, + "species": { + "La": "1", + "Ni": "3.55", + "Co": "0.75", + "Mn": "0.4", + "Al": "0.3" + } + } + ] + } + ] + }, + { + "material": "Sc2(MoO4)3: x% Eu3+", + "parser_output": [ + { + "material_string": "Sc2(MoO4)3: x% Eu3+", + "material_name": "", + "material_formula": "Sc2(MoO4)3", + "additives": [ + "Eu3+" + ], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Sc2(MoO4)3", + "amount": "1", + "elements": { + "Mo": "3", + "O": "12", + "Sc": "2" + }, + "species": { + "MoO4": "3", + "Sc": "2" + } + } + ] + } + ] + }, + { + "material": "(Na0.5K0.5)NbO3+1.5 mol% CuF2", + "parser_output": [ + { + "material_string": "(Na0.5K0.5)NbO3+1.5 mol% CuF2", + "material_name": "", + "material_formula": "(1-x)(Na0.5K0.5)NbO3-xCuF2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(Na0.5K0.5)NbO3", + "amount": "1-x", + "elements": { + "Na": "0.5", + "K": "0.5", + "Nb": "1", + "O": "3" + }, + "species": { + "Na": "0.5", + "K": "0.5", + "NbO3": "1" + } + }, + { + "formula": "CuF2", + "amount": "x", + "elements": { + "Cu": "1", + "F": "2" + }, + "species": { + "Cu": "1", + "F": "2" + } + } + ] + } + ] + }, + { + "material": "Ca0.90Sr2Al2O6:0.10Eu3+", + "parser_output": [ + { + "material_string": "Ca0.90Sr2Al2O6:0.10Eu3+", + "material_name": "", + "material_formula": "Eu0.10Ca0.90Sr2Al2O6", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Eu0.10Ca0.90Sr2Al2O6", + "amount": "1", + "elements": { + "Eu": "0.10", + "Ca": "0.9", + "Sr": "2", + "Al": "2", + "O": "6" + }, + "species": { + "Eu": "0.10", + "Ca": "0.9", + "Sr": "2", + "Al": "2", + "O": "6" + } + } + ] + } + ] + }, + { + "material": "Ba3-xSiO5:Eux2+", + "parser_output": [ + { + "material_string": "Ba3-xSiO5:Eux2+", + "material_name": "", + "material_formula": "EuxBa3-xSiO5", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "EuxBa3-xSiO5", + "amount": "1", + "elements": { + "Eu": "x", + "Ba": "3-x", + "Si": "1", + "O": "5" + }, + "species": { + "Eu": "x", + "Ba": "3-x", + "Si": "1", + "O": "5" + } + } + ] + } + ] + }, + { + "material": "Ca2-2xLi1+xSiO4F:xCe3+", + "parser_output": [ + { + "material_string": "Ca2-2xLi1+xSiO4F:xCe3+", + "material_name": "", + "material_formula": "CexCa2-2xLi1+xSiO4F", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "CexCa2-2xLi1+xSiO4F", + "amount": "1", + "elements": { + "Ce": "x", + "Ca": "2-2*x", + "Li": "x+1", + "Si": "1", + "O": "4", + "F": "1" + }, + "species": { + "Ce": "x", + "Ca": "2-2*x", + "Li": "x+1", + "SiO4": "1", + "F": "1" + } + } + ] + } + ] + }, + { + "material": "Ba5Si8O21:0.02Eu2+,xDy3+", + "parser_output": [ + { + "material_string": "Ba5Si8O21:0.02Eu2+,xDy3+", + "material_name": "", + "material_formula": "Ba5Si8O21", + "additives": [ + "0.02Eu2+", + "xDy3+" + ], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Ba5Si8O21", + "amount": "1", + "elements": { + "Ba": "5", + "Si": "8", + "O": "21" + }, + "species": { + "Ba": "5", + "Si8O21": "1" + } + } + ] + } + ] + }, + { + "material": "Zn1.92-2xYxLixSiO4:0.08Mn2+", + "parser_output": [ + { + "material_string": "Zn1.92-2xYxLixSiO4:0.08Mn2+", + "material_name": "", + "material_formula": "Mn0.08Zn1.92-2xYxLixSiO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Mn0.08Zn1.92-2xYxLixSiO4", + "amount": "1", + "elements": { + "Mn": "0.08", + "Zn": "1.92-2*x", + "Y": "x", + "Li": "x", + "Si": "1", + "O": "4" + }, + "species": { + "Mn": "0.08", + "Zn": "1.92-2*x", + "Y": "x", + "Li": "x", + "SiO4": "1" + } + } + ] + } + ] + } +] \ No newline at end of file diff --git a/tests/resources/cathode.json b/tests/resources/cathode.json new file mode 100644 index 0000000..808c999 --- /dev/null +++ b/tests/resources/cathode.json @@ -0,0 +1,43 @@ +[ + { + "material": "Na2/3Ni1/3Co(1/3-x)Mn1/3AlxO2", + "parser_output": { + "material_string": "Na2/3Ni1/3Co(1/3-x)Mn1/3AlxO2", + "material_name": "", + "material_formula": "Na2/3Ni1/3Co1/3-xMn1/3AlxO2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Na2/3Ni1/3Co1/3-xMn1/3AlxO2", + "amount": "1", + "elements": { + "Na": "0.667", + "Ni": "0.333", + "Co": "0.333-x", + "Mn": "0.333", + "Al": "x", + "O": "2" + }, + "species": { + "Na": "0.667", + "Ni": "0.333", + "Co": "0.333-x", + "Mn": "0.333", + "Al": "x", + "O": "2" + } + } + ] + } + } +] \ No newline at end of file diff --git a/tests/resources/chemical_names.json b/tests/resources/chemical_names.json new file mode 100644 index 0000000..c3be4b3 --- /dev/null +++ b/tests/resources/chemical_names.json @@ -0,0 +1,875 @@ +[ + { + "material": "manganese (II) nitrate Mn(NO3)2·4H2O", + "parser_output": [ + { + "material_string": "manganese (II) nitrate Mn(NO3)2·4H2O", + "material_name": "manganese (II) nitrate", + "material_formula": "Mn(NO3)2·4H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Mn(NO3)2", + "amount": "1", + "elements": { + "N": "2", + "O": "6", + "Mn": "1" + }, + "species": { + "NO3": "2", + "Mn": "1" + } + }, + { + "formula": "H2O", + "amount": "4", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "Lanthanum strontium manganite (La0.8Sr0.2)0.97MnO3", + "parser_output": [ + { + "material_string": "Lanthanum strontium manganite (La0.8Sr0.2)0.97MnO3", + "material_name": "Lanthanum strontium manganite", + "material_formula": "(La0.8Sr0.2)0.97MnO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(La0.8Sr0.2)0.97MnO3", + "amount": "1", + "elements": { + "La": "0.776", + "Sr": "0.194", + "Mn": "1", + "O": "3" + }, + "species": { + "La": "0.776", + "Sr": "0.194", + "Mn": "1", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "lutetium oxide Lu2O3", + "parser_output": [ + { + "material_string": "lutetium oxide Lu2O3", + "material_name": "lutetium oxide", + "material_formula": "Lu2O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Lu2O3", + "amount": "1", + "elements": { + "Lu": "2", + "O": "3" + }, + "species": { + "Lu": "2", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "molybdenum oxide Ag1/8Pr5/8MoO4", + "parser_output": [ + { + "material_string": "molybdenum oxide Ag1/8Pr5/8MoO4", + "material_name": "molybdenum oxide", + "material_formula": "Ag1/8Pr5/8MoO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Ag1/8Pr5/8MoO4", + "amount": "1", + "elements": { + "Ag": "0.125", + "Pr": "0.625", + "Mo": "1", + "O": "4" + }, + "species": { + "Ag": "0.125", + "Pr": "0.625", + "MoO4": "1" + } + } + ] + } + ] + }, + { + "material": "Er chlorides", + "parser_output": [ + { + "material_string": "Er chlorides", + "material_name": "erbium chloride", + "material_formula": "ErCl3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "ErCl3", + "amount": "1", + "elements": { + "Er": "1", + "Cl": "3" + }, + "species": { + "Er": "1", + "Cl": "3" + } + } + ] + } + ] + }, + { + "material": "ferric perchlorate", + "parser_output": [ + { + "material_string": "ferric perchlorate", + "material_name": "ferric perchlorate", + "material_formula": "Fe(ClO4)3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Fe(ClO4)3", + "amount": "1", + "elements": { + "Cl": "3", + "O": "12", + "Fe": "1" + }, + "species": { + "ClO4": "3", + "Fe": "1" + } + } + ] + } + ] + }, + { + "material": "triammonium citrate", + "parser_output": [ + { + "material_string": "triammonium citrate", + "material_name": "triammonium citrate", + "material_formula": "(NH4)3C3H5O(COO)3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(NH4)3C3H5O(COO)3", + "amount": "1", + "elements": { + "N": "3", + "H": "17", + "C": "6", + "O": "7" + }, + "species": { + "NH4": "3", + "C3H5O(COO)3": "1" + } + } + ] + } + ] + }, + { + "material": "ammonium molybdate ((NH4)6Mo7O24·4H2O)", + "parser_output": [ + { + "material_string": "ammonium molybdate ((NH4)6Mo7O24·4H2O)", + "material_name": "ammonium molybdate", + "material_formula": "(NH4)6Mo7O24·4H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(NH4)6Mo7O24", + "amount": "1", + "elements": { + "N": "6", + "H": "24", + "Mo": "7", + "O": "24" + }, + "species": { + "NH4": "6", + "Mo7O24": "1" + } + }, + { + "formula": "H2O", + "amount": "4", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "Copper Sulfate Pentahydrate", + "parser_output": [ + { + "material_string": "Copper Sulfate Pentahydrate", + "material_name": "Copper Sulfate Pentahydrate", + "material_formula": "CuSO4·5H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "CuSO4", + "amount": "1", + "elements": { + "Cu": "1", + "S": "1", + "O": "4" + }, + "species": { + "Cu": "1", + "SO4": "1" + } + }, + { + "formula": "H2O", + "amount": "5", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "antimony pentoxide", + "parser_output": [ + { + "material_string": "antimony pentoxide", + "material_name": "antimony pentoxide", + "material_formula": "Sb2O5", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Sb2O5", + "amount": "1", + "elements": { + "Sb": "2", + "O": "5" + }, + "species": { + "Sb": "2", + "O": "5" + } + } + ] + } + ] + }, + { + "material": "zinc (II) acetate dihydrate", + "parser_output": [ + { + "material_string": "zinc (II) acetate dihydrate", + "material_name": "zinc (II) acetate dihydrate", + "material_formula": "Zn(CH3COO)2·2H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Zn(CH3COO)2", + "amount": "1", + "elements": { + "C": "4", + "H": "6", + "O": "4", + "Zn": "1" + }, + "species": { + "CH3COO": "2", + "Zn": "1" + } + }, + { + "formula": "H2O", + "amount": "2", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "yttrium (III) nitrate hexahydrate", + "parser_output": [ + { + "material_string": "yttrium (III) nitrate hexahydrate", + "material_name": "yttrium (III) nitrate hexahydrate", + "material_formula": "Y(NO3)3·6H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Y(NO3)3", + "amount": "1", + "elements": { + "N": "3", + "O": "9", + "Y": "1" + }, + "species": { + "NO3": "3", + "Y": "1" + } + }, + { + "formula": "H2O", + "amount": "6", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "Titanium Tetraisopropoxide", + "parser_output": [ + { + "material_string": "Titanium Tetraisopropoxide", + "material_name": "Titanium Tetraisopropoxide", + "material_formula": "Ti(OCH(CH3)2)4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Ti(OCH(CH3)2)4", + "amount": "1", + "elements": { + "C": "12", + "H": "28", + "O": "4", + "Ti": "1" + }, + "species": { + "OCH(CH3)2": "4", + "Ti": "1" + } + } + ] + } + ] + }, + { + "material": "praseodymium tungstate Cd0.25Pr0.50□0.25WO4", + "parser_output": [ + { + "material_string": "praseodymium tungstate Cd0.25Pr0.50□0.25WO4", + "material_name": "praseodymium tungstate", + "material_formula": "Cd0.25Pr0.50□0.25WO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Cd0.25Pr0.50□0.25WO4", + "amount": "1", + "elements": { + "Cd": "0.25", + "Pr": "0.5", + "□": "0.25", + "W": "1", + "O": "4" + }, + "species": { + "Cd": "0.25", + "Pr": "0.5", + "□": "0.25", + "WO4": "1" + } + } + ] + } + ] + }, + { + "material": "europium (III) nitrate pentahydrate", + "parser_output": [ + { + "material_string": "europium (III) nitrate pentahydrate", + "material_name": "europium (III) nitrate pentahydrate", + "material_formula": "Eu(NO3)3·5H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Eu(NO3)3", + "amount": "1", + "elements": { + "N": "3", + "O": "9", + "Eu": "1" + }, + "species": { + "NO3": "3", + "Eu": "1" + } + }, + { + "formula": "H2O", + "amount": "5", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "gadolinium aluminium borate", + "parser_output": [ + { + "material_string": "gadolinium aluminium borate", + "material_name": "gadolinium aluminium borate", + "material_formula": "GdAl3(BO3)4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "GdAl3(BO3)4", + "amount": "1", + "elements": { + "B": "4", + "O": "12", + "Gd": "1", + "Al": "3" + }, + "species": { + "BO3": "4", + "Gd": "1", + "Al": "3" + } + } + ] + } + ] + }, + { + "material": "Ammonium heptamolybdate", + "parser_output": [ + { + "material_string": "Ammonium heptamolybdate", + "material_name": "Ammonium heptamolybdate", + "material_formula": "(NH4)6Mo7O24", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(NH4)6Mo7O24", + "amount": "1", + "elements": { + "N": "6", + "H": "24", + "Mo": "7", + "O": "24" + }, + "species": { + "NH4": "6", + "Mo7O24": "1" + } + } + ] + } + ] + }, + { + "material": "Al-nitrate hydrate", + "parser_output": [ + { + "material_string": "Al-nitrate hydrate", + "material_name": "Al-nitrate hydrate", + "material_formula": "Al(NO3)3·H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Al(NO3)3", + "amount": "1", + "elements": { + "N": "3", + "O": "9", + "Al": "1" + }, + "species": { + "NO3": "3", + "Al": "1" + } + }, + { + "formula": "H2O", + "amount": "1", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "manganese di-oxide", + "parser_output": [ + { + "material_string": "manganese di-oxide", + "material_name": "manganese di-oxide", + "material_formula": "MnO2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "MnO2", + "amount": "1", + "elements": { + "Mn": "1", + "O": "2" + }, + "species": { + "Mn": "1", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "titanium (IV) oxide", + "parser_output": [ + { + "material_string": "titanium (IV) oxide", + "material_name": "titanium (IV) oxide", + "material_formula": "TiO2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "TiO2", + "amount": "1", + "elements": { + "Ti": "1", + "O": "2" + }, + "species": { + "Ti": "1", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "lanthanum-calcium chromites-titanates (La1-xCax)1-zCr1-yTiyO3", + "parser_output": [ + { + "material_string": "lanthanum-calcium chromites-titanates (La1-xCax)1-zCr1-yTiyO3", + "material_name": "lanthanum-calcium chromites-titanates", + "material_formula": "(La1-xCax)1-zCr1-yTiyO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + }, + "z": { + "values": [], + "max_value": null, + "min_value": null + }, + "y": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "(La1-xCax)1-zCr1-yTiyO3", + "amount": "1", + "elements": { + "La": "(x-1)*(z-1)", + "Ca": "x*(1-z)", + "Cr": "1-y", + "Ti": "y", + "O": "3" + }, + "species": { + "La": "(x-1)*(z-1)", + "Ca": "x*(1-z)", + "Cr": "1-y", + "Ti": "y", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "YBa2Cu3O7-y (YBCO)", + "parser_output": [ + { + "material_string": "YBa2Cu3O7-y (YBCO)", + "material_name": "(YBCO)", + "material_formula": "YBa2Cu3O7", + "additives": [], + "phase": "", + "oxygen_deficiency": "-", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "YBa2Cu3O7", + "amount": "1", + "elements": { + "Y": "1", + "Ba": "2", + "Cu": "3", + "O": "7" + }, + "species": { + "Y": "1", + "Ba": "2", + "Cu": "3", + "O": "7" + } + } + ] + } + ] + }, + { + "material": "Hydrated tin (II) chloride", + "parser_output": [ + { + "material_string": "Hydrated tin (II) chloride", + "material_name": "Hydrated tin (II) chloride", + "material_formula": "SnCl2·H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "SnCl2", + "amount": "1", + "elements": { + "Sn": "1", + "Cl": "2" + }, + "species": { + "Sn": "1", + "Cl": "2" + } + }, + { + "formula": "H2O", + "amount": "1", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "YAG", + "parser_output": [ + { + "material_string": "YAG", + "material_name": "", + "material_formula": "Y3Al5O12", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Y3Al5O12", + "amount": "1", + "elements": { + "Y": "3", + "Al": "5", + "O": "12" + }, + "species": { + "Y": "3", + "Al": "5", + "O": "12" + } + } + ] + } + ] + }, + { + "material": "ilmenite", + "parser_output": [ + { + "material_string": "ilmenite", + "material_name": "ilmenite", + "material_formula": "FeTiO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "FeTiO3", + "amount": "1", + "elements": { + "Fe": "1", + "Ti": "1", + "O": "3" + }, + "species": { + "Fe": "1", + "TiO3": "1" + } + } + ] + } + ] + } +] \ No newline at end of file diff --git a/tests/resources/comprehensive.json b/tests/resources/comprehensive.json new file mode 100644 index 0000000..0426029 --- /dev/null +++ b/tests/resources/comprehensive.json @@ -0,0 +1,133 @@ +[ + { + "material": "P2-Na7/10Mn3/4Fe(1/4-x-y)NixCoyO2", + "parser_output": [ + { + "material_string": "P2-Na7/10Mn3/4Fe(1/4-x-y)NixCoyO2", + "material_name": "", + "material_formula": "Na7/10Mn3/4Fe(1/4-x-y)NixCoyO2", + "additives": [], + "phase": "P2", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + }, + "y": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Na7/10Mn3/4Fe(1/4-x-y)NixCoyO2", + "amount": "1", + "elements": { + "Na": "0.7", + "Mn": "0.75", + "Fe": "0.25-x-y", + "Ni": "x", + "Co": "y", + "O": "2" + }, + "species": { + "Na": "0.7", + "Mn": "0.75", + "Fe": "0.25-x-y", + "Ni": "x", + "Co": "y", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "Mg-doped Na2/3Ni(3/10-x)MgxMn7/10O2", + "parser_output": [ + { + "material_string": "Mg-doped Na2/3Ni(3/10-x)MgxMn7/10O2", + "material_name": "", + "material_formula": "Na2/3Ni3/10-xMgxMn7/10O2", + "additives": [ + "Mg" + ], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Na2/3Ni3/10-xMgxMn7/10O2", + "amount": "1", + "elements": { + "Na": "0.667", + "Ni": "0.3-x", + "Mg": "x", + "Mn": "0.7", + "O": "2" + }, + "species": { + "Na": "0.667", + "Ni": "0.3-x", + "Mg": "x", + "Mn": "0.7", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "Al3+-doped (Zn1/6Ba1/6Ca1/6Sr1/6La1/3)TiO3", + "parser_output": [ + { + "material_string": "Al3+-doped (Zn1/6Ba1/6Ca1/6Sr1/6La1/3)TiO3", + "material_name": "", + "material_formula": "(Zn1/6Ba1/6Ca1/6Sr1/6La1/3)TiO3", + "additives": [ + "Al3+" + ], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(Zn1/6Ba1/6Ca1/6Sr1/6La1/3)TiO3", + "amount": "1", + "elements": { + "Zn": "0.167", + "Ba": "0.167", + "Ca": "0.167", + "Sr": "0.167", + "La": "0.333", + "Ti": "1", + "O": "3" + }, + "species": { + "Zn": "0.167", + "Ba": "0.167", + "Ca": "0.167", + "Sr": "0.167", + "La": "0.333", + "TiO3": "1" + } + } + ] + } + ] + } +] \ No newline at end of file diff --git a/tests/resources/formulas.json b/tests/resources/formulas.json new file mode 100644 index 0000000..c76aeb4 --- /dev/null +++ b/tests/resources/formulas.json @@ -0,0 +1,1254 @@ +[ + { + "material": "(La0.8Sr0.2)0.97MnO3", + "parser_output": [ + { + "material_string": "(La0.8Sr0.2)0.97MnO3", + "material_name": "", + "material_formula": "(La0.8Sr0.2)0.97MnO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(La0.8Sr0.2)0.97MnO3", + "amount": "1", + "elements": { + "La": "0.776", + "Sr": "0.194", + "Mn": "1", + "O": "3" + }, + "species": { + "La": "0.776", + "Sr": "0.194", + "Mn": "1", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "Ag1/8Pr5/8MoO4", + "parser_output": [ + { + "material_string": "Ag1/8Pr5/8MoO4", + "material_name": "", + "material_formula": "Ag1/8Pr5/8MoO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Ag1/8Pr5/8MoO4", + "amount": "1", + "elements": { + "Ag": "0.125", + "Pr": "0.625", + "Mo": "1", + "O": "4" + }, + "species": { + "Ag": "0.125", + "Pr": "0.625", + "MoO4": "1" + } + } + ] + } + ] + }, + { + "material": "(NH4)3C3H5O(COO)3", + "parser_output": [ + { + "material_string": "(NH4)3C3H5O(COO)3", + "material_name": "", + "material_formula": "(NH4)3C3H5O(COO)3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(NH4)3C3H5O(COO)3", + "amount": "1", + "elements": { + "N": "3", + "H": "17", + "C": "6", + "O": "7" + }, + "species": { + "NH4": "3", + "C3H5O(COO)3": "1" + } + } + ] + } + ] + }, + { + "material": "Li5+xLa3Ta2-xGexO12", + "parser_output": [ + { + "material_string": "Li5+xLa3Ta2-xGexO12", + "material_name": "", + "material_formula": "Li5+xLa3Ta2-xGexO12", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Li5+xLa3Ta2-xGexO12", + "amount": "1", + "elements": { + "Li": "x+5", + "La": "3", + "Ta": "2-x", + "Ge": "x", + "O": "12" + }, + "species": { + "Li": "x+5", + "La": "3", + "Ta": "2-x", + "Ge": "x", + "O": "12" + } + } + ] + } + ] + }, + { + "material": "Ni0.6Mg0.3Mn1.5-xAl0.6+xO4", + "parser_output": [ + { + "material_string": "Ni0.6Mg0.3Mn1.5-xAl0.6+xO4", + "material_name": "", + "material_formula": "Ni0.6Mg0.3Mn1.5-xAl0.6+xO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Ni0.6Mg0.3Mn1.5-xAl0.6+xO4", + "amount": "1", + "elements": { + "Ni": "0.6", + "Mg": "0.3", + "Mn": "1.5-x", + "Al": "x+0.6", + "O": "4" + }, + "species": { + "Ni": "0.6", + "Mg": "0.3", + "Mn": "1.5-x", + "Al": "x+0.6", + "O": "4" + } + } + ] + } + ] + }, + { + "material": "Li1+xMn2-xO4-zFz", + "parser_output": [ + { + "material_string": "Li1+xMn2-xO4-zFz", + "material_name": "", + "material_formula": "Li1+xMn2-xO4-zFz", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + }, + "z": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Li1+xMn2-xO4-zFz", + "amount": "1", + "elements": { + "Li": "x+1", + "Mn": "2-x", + "O": "4-z", + "F": "z" + }, + "species": { + "Li": "x+1", + "Mn": "2-x", + "O": "4-z", + "F": "z" + } + } + ] + } + ] + }, + { + "material": "Cd0.25Pr0.50□0.25WO4", + "parser_output": [ + { + "material_string": "Cd0.25Pr0.50□0.25WO4", + "material_name": "", + "material_formula": "Cd0.25Pr0.50□0.25WO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Cd0.25Pr0.50□0.25WO4", + "amount": "1", + "elements": { + "Cd": "0.25", + "Pr": "0.5", + "□": "0.25", + "W": "1", + "O": "4" + }, + "species": { + "Cd": "0.25", + "Pr": "0.5", + "□": "0.25", + "WO4": "1" + } + } + ] + } + ] + }, + { + "material": "Ba2In2(1-x)Ti2xO5+x□1-x", + "parser_output": [ + { + "material_string": "Ba2In2(1-x)Ti2xO5+x□1-x", + "material_name": "", + "material_formula": "Ba2In2-2xTi2xO5+x□1-x", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Ba2In2-2*xTi2xO5+x□1-x", + "amount": "1", + "elements": { + "Ba": "2", + "In": "2-2*x", + "Ti": "2*x", + "O": "x+5", + "□": "1-x" + }, + "species": { + "Ba": "2", + "In": "2-2*x", + "Ti": "2*x", + "O": "x+5", + "□": "1-x" + } + } + ] + } + ] + }, + { + "material": "Li2SrTa2(1-x)Nb2xO7", + "parser_output": [ + { + "material_string": "Li2SrTa2(1-x)Nb2xO7", + "material_name": "", + "material_formula": "Li2SrTa2-2xNb2xO7", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Li2SrTa2-2*xNb2xO7", + "amount": "1", + "elements": { + "Li": "2", + "Sr": "1", + "Ta": "2-2*x", + "Nb": "2*x", + "O": "7" + }, + "species": { + "Li": "2", + "Sr": "1", + "Ta": "2-2*x", + "Nb": "2*x", + "O": "7" + } + } + ] + } + ] + }, + { + "material": "Li(1+x)Mg3AlSi3(1+x)O10+6.5xF2", + "parser_output": [ + { + "material_string": "Li(1+x)Mg3AlSi3(1+x)O10+6.5xF2", + "material_name": "", + "material_formula": "Lix+1Mg3AlSi3x+3O10+6.5xF2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Lix+1Mg3AlSi3*x+3O10+6.5xF2", + "amount": "1", + "elements": { + "Li": "x+1", + "Mg": "3", + "Al": "1", + "Si": "3*x+3", + "O": "6.5*x+10", + "F": "2" + }, + "species": { + "Li": "x+1", + "Mg": "3", + "Al": "1", + "Si": "3*x+3", + "O": "6.5*x+10", + "F": "2" + } + } + ] + } + ] + }, + { + "material": "SrFe1-xMxO3-z", + "parser_output": [ + { + "material_string": "SrFe1-xMxO3-z", + "material_name": "", + "material_formula": "SrFe1-xMxO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "-", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": { + "M": [] + }, + "composition": [ + { + "formula": "SrFe1-xMxO3", + "amount": "1", + "elements": { + "Sr": "1", + "Fe": "1-x", + "M": "x", + "O": "3" + }, + "species": { + "Sr": "1", + "Fe": "1-x", + "M": "x", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "O2", + "parser_output": [ + { + "material_string": "O2", + "material_name": "oxygen", + "material_formula": "O2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "O2", + "amount": "1", + "elements": { + "O": "2" + }, + "species": { + "O2": "1" + } + } + ] + } + ] + }, + { + "material": "Re0.2Ce0.8O2-d", + "parser_output": [ + { + "material_string": "Re0.2Ce0.8O2-d", + "material_name": "", + "material_formula": "Re0.2Ce0.8O2", + "additives": [], + "phase": "", + "oxygen_deficiency": "-", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Re0.2Ce0.8O2", + "amount": "1", + "elements": { + "Re": "0.2", + "Ce": "0.8", + "O": "2" + }, + "species": { + "Re": "0.2", + "Ce": "0.8", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "(Y,In)BaCo3ZnO7+δ", + "parser_output": [ + { + "material_string": "(Y,In)BaCo3ZnO7+δ", + "material_name": "", + "material_formula": "MBaCo3ZnO7", + "additives": [], + "phase": "", + "oxygen_deficiency": "+", + "amounts_x": {}, + "elements_x": { + "M": [ + "Y", + "In" + ] + }, + "composition": [ + { + "formula": "MBaCo3ZnO7", + "amount": "1", + "elements": { + "M": "1", + "Ba": "1", + "Co": "3", + "Zn": "1", + "O": "7" + }, + "species": { + "M": "1", + "Ba": "1", + "Co": "3", + "Zn": "1", + "O": "7" + } + } + ] + } + ] + }, + { + "material": "Ba0.5Sr0.5Co0.8Fe0.2O3-δ", + "parser_output": [ + { + "material_string": "Ba0.5Sr0.5Co0.8Fe0.2O3-δ", + "material_name": "", + "material_formula": "Ba0.5Sr0.5Co0.8Fe0.2O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "-", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Ba0.5Sr0.5Co0.8Fe0.2O3", + "amount": "1", + "elements": { + "Ba": "0.5", + "Sr": "0.5", + "Co": "0.8", + "Fe": "0.2", + "O": "3" + }, + "species": { + "Ba": "0.5", + "Sr": "0.5", + "Co": "0.8", + "Fe": "0.2", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "(La0.2Sr0.8)0.9Ti0.9Fe0.1O3±δ", + "parser_output": [ + { + "material_string": "(La0.2Sr0.8)0.9Ti0.9Fe0.1O3±δ", + "material_name": "", + "material_formula": "(La0.2Sr0.8)0.9Ti0.9Fe0.1O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "±", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(La0.2Sr0.8)0.9Ti0.9Fe0.1O3", + "amount": "1", + "elements": { + "La": "0.18", + "Sr": "0.72", + "Ti": "0.9", + "Fe": "0.1", + "O": "3" + }, + "species": { + "La": "0.18", + "Sr": "0.72", + "Ti": "0.9", + "Fe": "0.1", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "(Sn0.5Zn0.5)GaO3(ZnO)", + "parser_output": [ + { + "material_string": "(Sn0.5Zn0.5)GaO3(ZnO)", + "material_name": "", + "material_formula": "(Sn0.5Zn0.5)GaO3(ZnO)", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(Sn0.5Zn0.5)GaO3(ZnO)", + "amount": "1", + "elements": { + "Sn": "0.5", + "Zn": "1.5", + "O": "4", + "Ga": "1" + }, + "species": { + "Sn": "0.5", + "Zn": "1.5", + "O": "4", + "Ga": "1" + } + } + ] + } + ] + }, + { + "material": "LiMn0.85Fe0.15PO4", + "parser_output": [ + { + "material_string": "LiMn0.85Fe0.15PO4", + "material_name": "", + "material_formula": "LiMn0.85Fe0.15PO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "LiMn0.85Fe0.15PO4", + "amount": "1", + "elements": { + "Li": "1", + "Mn": "0.85", + "Fe": "0.15", + "P": "1", + "O": "4" + }, + "species": { + "Li": "1", + "Mn": "0.85", + "Fe": "0.15", + "PO4": "1" + } + } + ] + } + ] + }, + { + "material": "Ba(Mg(1-x)/3SnxTa2(1-x)/3)O3", + "parser_output": [ + { + "material_string": "Ba(Mg(1-x)/3SnxTa2(1-x)/3)O3", + "material_name": "", + "material_formula": "Ba(Mg1/3-x/3SnxTa2/3-2x/3)O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Ba(Mg1/3-x/3SnxTa2/3-2*x/3)O3", + "amount": "1", + "elements": { + "Mg": "0.333-0.333*x", + "Sn": "x", + "Ta": "0.667-0.667*x", + "Ba": "1", + "O": "3" + }, + "species": { + "Mg": "0.333-0.333*x", + "Sn": "x", + "Ta": "0.667-0.667*x", + "Ba": "1", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "Y0.8Ca0.2Ba2Cu3Oy", + "parser_output": [ + { + "material_string": "Y0.8Ca0.2Ba2Cu3Oy", + "material_name": "", + "material_formula": "Y0.8Ca0.2Ba2Cu3O", + "additives": [], + "phase": "", + "oxygen_deficiency": "±", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Y0.8Ca0.2Ba2Cu3O", + "amount": "1", + "elements": { + "Y": "0.8", + "Ca": "0.2", + "Ba": "2", + "Cu": "3", + "O": "1" + }, + "species": { + "Y": "0.8", + "Ca": "0.2", + "Ba": "2", + "Cu": "3", + "O": "1" + } + } + ] + } + ] + }, + { + "material": "BaCo1-x(Sm/Ce)xO3-δ", + "parser_output": [ + { + "material_string": "BaCo1-x(Sm/Ce)xO3-δ", + "material_name": "", + "material_formula": "BaCo1-xMxO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "-", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": { + "M": [ + "Sm", + "Ce" + ] + }, + "composition": [ + { + "formula": "BaCo1-xMxO3", + "amount": "1", + "elements": { + "Ba": "1", + "Co": "1-x", + "M": "x", + "O": "3" + }, + "species": { + "Ba": "1", + "Co": "1-x", + "M": "x", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "Na1+x+yZr2-yYySixP3-xO12", + "parser_output": [ + { + "material_string": "Na1+x+yZr2-yYySixP3-xO12", + "material_name": "", + "material_formula": "Na1+x+yZr2-yYySixP3-xO12", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + }, + "y": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Na1+x+yZr2-yYySixP3-xO12", + "amount": "1", + "elements": { + "Na": "x+y+1", + "Zr": "2-y", + "Y": "y", + "Si": "x", + "P": "3-x", + "O": "12" + }, + "species": { + "Na": "x+y+1", + "Zr": "2-y", + "Y": "y", + "Si": "x", + "P": "3-x", + "O": "12" + } + } + ] + } + ] + }, + { + "material": "(Ni1-xMx)Mn2O4", + "parser_output": [ + { + "material_string": "(Ni1-xMx)Mn2O4", + "material_name": "", + "material_formula": "(Ni1-xMx)Mn2O4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": { + "M": [] + }, + "composition": [ + { + "formula": "(Ni1-xMx)Mn2O4", + "amount": "1", + "elements": { + "Ni": "1-x", + "M": "x", + "Mn": "2", + "O": "4" + }, + "species": { + "Ni": "1-x", + "M": "x", + "Mn": "2", + "O": "4" + } + } + ] + } + ] + }, + { + "material": "(La1-xCax)1-zCr1-yTiyO3", + "parser_output": [ + { + "material_string": "(La1-xCax)1-zCr1-yTiyO3", + "material_name": "", + "material_formula": "(La1-xCax)1-zCr1-yTiyO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + }, + "z": { + "values": [], + "max_value": null, + "min_value": null + }, + "y": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "(La1-xCax)1-zCr1-yTiyO3", + "amount": "1", + "elements": { + "La": "(x-1)*(z-1)", + "Ca": "x*(1-z)", + "Cr": "1-y", + "Ti": "y", + "O": "3" + }, + "species": { + "La": "(x-1)*(z-1)", + "Ca": "x*(1-z)", + "Cr": "1-y", + "Ti": "y", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "La0.4Ba0.6Ti0.6RE0.4O3", + "parser_output": [ + { + "material_string": "La0.4Ba0.6Ti0.6RE0.4O3", + "material_name": "", + "material_formula": "La0.4Ba0.6Ti0.6RE0.4O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": { + "RE": [] + }, + "composition": [ + { + "formula": "La0.4Ba0.6Ti0.6RE0.4O3", + "amount": "1", + "elements": { + "La": "0.4", + "Ba": "0.6", + "Ti": "0.6", + "O": "3", + "RE": "0.4" + }, + "species": { + "La": "0.4", + "Ba": "0.6", + "Ti": "0.6", + "RE": "0.4", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "LiMn2-y-zLiyMzO4-δFδ", + "parser_output": [ + { + "material_string": "LiMn2-y-zLiyMzO4-δFδ", + "material_name": "", + "material_formula": "LiMn2-y-zLiyMzO4-δFδ", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "y": { + "values": [], + "max_value": null, + "min_value": null + }, + "z": { + "values": [], + "max_value": null, + "min_value": null + }, + "δ": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": { + "M": [] + }, + "composition": [ + { + "formula": "LiMn2-y-zLiyMzO4-δFδ", + "amount": "1", + "elements": { + "Li": "y+1", + "Mn": "2-y-z", + "M": "z", + "O": "4-δ", + "F": "δ" + }, + "species": { + "Li": "y+1", + "Mn": "2-y-z", + "M": "z", + "O": "4-δ", + "F": "δ" + } + } + ] + } + ] + }, + { + "material": "Na2/3(CoxNi1/3-xMn2/3)O2", + "parser_output": [ + { + "material_string": "Na2/3(CoxNi1/3-xMn2/3)O2", + "material_name": "", + "material_formula": "Na2/3(CoxNi1/3-xMn2/3)O2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Na2/3(CoxNi1/3-xMn2/3)O2", + "amount": "1", + "elements": { + "Co": "x", + "Ni": "0.333-x", + "Mn": "0.667", + "Na": "0.667", + "O": "2" + }, + "species": { + "Co": "x", + "Ni": "0.333-x", + "Mn": "0.667", + "Na": "0.667", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "LiCo1-2xNixMnxO2", + "parser_output": [ + { + "material_string": "LiCo1-2xNixMnxO2", + "material_name": "", + "material_formula": "LiCo1-2xNixMnxO2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "LiCo1-2xNixMnxO2", + "amount": "1", + "elements": { + "Li": "1", + "Co": "1-2*x", + "Ni": "x", + "Mn": "x", + "O": "2" + }, + "species": { + "Li": "1", + "Co": "1-2*x", + "Ni": "x", + "Mn": "x", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "Mn(1-3x)In2x□xWO4", + "parser_output": [ + { + "material_string": "Mn(1-3x)In2x□xWO4", + "material_name": "", + "material_formula": "Mn1-3xIn2x□xWO4", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Mn1-3*xIn2x□xWO4", + "amount": "1", + "elements": { + "Mn": "1-3*x", + "In": "2*x", + "□": "x", + "W": "1", + "O": "4" + }, + "species": { + "Mn": "1-3*x", + "In": "2*x", + "□": "x", + "WO4": "1" + } + } + ] + } + ] + }, + { + "material": "Na2/3Ni1/3Co(1/3-x)Mn1/3AlxO2", + "parser_output": [ + { + "material_string": "Na2/3Ni1/3Co(1/3-x)Mn1/3AlxO2", + "material_name": "", + "material_formula": "Na2/3Ni1/3Co1/3-xMn1/3AlxO2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Na2/3Ni1/3Co1/3-xMn1/3AlxO2", + "amount": "1", + "elements": { + "Na": "0.667", + "Ni": "0.333", + "Co": "0.333-x", + "Mn": "0.333", + "Al": "x", + "O": "2" + }, + "species": { + "Na": "0.667", + "Ni": "0.333", + "Co": "0.333-x", + "Mn": "0.333", + "Al": "x", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "(Cu1/3Nb2/3)1/4Ti(3/4-x)ZrxO2", + "parser_output": [ + { + "material_string": "(Cu1/3Nb2/3)1/4Ti(3/4-x)ZrxO2", + "material_name": "", + "material_formula": "(Cu1/3Nb2/3)1/4Ti3/4-xZrxO2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "(Cu1/3Nb2/3)1/4Ti3/4-xZrxO2", + "amount": "1", + "elements": { + "Cu": "0.083", + "Nb": "0.167", + "Ti": "0.75-x", + "Zr": "x", + "O": "2" + }, + "species": { + "Cu": "0.083", + "Nb": "0.167", + "Ti": "0.75-x", + "Zr": "x", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "Nd(Zn1/2Ti(1/2+x))O3+2x", + "parser_output": [ + { + "material_string": "Nd(Zn1/2Ti(1/2+x))O3+2x", + "material_name": "", + "material_formula": "Nd(Zn1/2Tix+1/2)O3+2x", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Nd(Zn1/2Tix+1/2)O3+2x", + "amount": "1", + "elements": { + "Zn": "0.5", + "Ti": "x+0.5", + "Nd": "1", + "O": "2*x+3" + }, + "species": { + "Zn": "0.5", + "Ti": "x+0.5", + "Nd": "1", + "O": "2*x+3" + } + } + ] + } + ] + } +] \ No newline at end of file diff --git a/tests/resources/mixtures.json b/tests/resources/mixtures.json new file mode 100644 index 0000000..de4aaf2 --- /dev/null +++ b/tests/resources/mixtures.json @@ -0,0 +1,1012 @@ +[ + { + "material": "Mn(NO3)2·4H2O", + "parser_output": [ + { + "material_string": "Mn(NO3)2·4H2O", + "material_name": "", + "material_formula": "Mn(NO3)2·4H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Mn(NO3)2", + "amount": "1", + "elements": { + "N": "2", + "O": "6", + "Mn": "1" + }, + "species": { + "NO3": "2", + "Mn": "1" + } + }, + { + "formula": "H2O", + "amount": "4", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "(NH4)6Mo7O24·4H2O", + "parser_output": [ + { + "material_string": "(NH4)6Mo7O24·4H2O", + "material_name": "", + "material_formula": "(NH4)6Mo7O24·4H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(NH4)6Mo7O24", + "amount": "1", + "elements": { + "N": "6", + "H": "24", + "Mo": "7", + "O": "24" + }, + "species": { + "NH4": "6", + "Mo7O24": "1" + } + }, + { + "formula": "H2O", + "amount": "4", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "Zn(CH3COO)2·2H2O", + "parser_output": [ + { + "material_string": "Zn(CH3COO)2·2H2O", + "material_name": "", + "material_formula": "Zn(CH3COO)2·2H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Zn(CH3COO)2", + "amount": "1", + "elements": { + "C": "4", + "H": "6", + "O": "4", + "Zn": "1" + }, + "species": { + "CH3COO": "2", + "Zn": "1" + } + }, + { + "formula": "H2O", + "amount": "2", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "(1-x)Pb(Zr0.52Ti0.48)O3-xBaTiO3", + "parser_output": [ + { + "material_string": "(1-x)Pb(Zr0.52Ti0.48)O3-xBaTiO3", + "material_name": "", + "material_formula": "(1-x)Pb(Zr0.52Ti0.48)O3-xBaTiO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Pb(Zr0.52Ti0.48)O3", + "amount": "1-x", + "elements": { + "Zr": "0.52", + "Ti": "0.48", + "Pb": "1", + "O": "3" + }, + "species": { + "Zr": "0.52", + "Ti": "0.48", + "Pb": "1", + "O": "3" + } + }, + { + "formula": "BaTiO3", + "amount": "x", + "elements": { + "Ba": "1", + "Ti": "1", + "O": "3" + }, + "species": { + "Ba": "1", + "TiO3": "1" + } + } + ] + } + ] + }, + { + "material": "16Na2O·10CaO·xAl2O3·(74-x)SiO2", + "parser_output": [ + { + "material_string": "16Na2O·10CaO·xAl2O3·(74-x)SiO2", + "material_name": "", + "material_formula": "16Na2O-10CaO-xAl2O3-(74-x)SiO2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Na2O", + "amount": "16", + "elements": { + "Na": "2", + "O": "1" + }, + "species": { + "Na": "2", + "O": "1" + } + }, + { + "formula": "CaO", + "amount": "10", + "elements": { + "Ca": "1", + "O": "1" + }, + "species": { + "Ca": "1", + "O": "1" + } + }, + { + "formula": "Al2O3", + "amount": "x", + "elements": { + "Al": "2", + "O": "3" + }, + "species": { + "Al": "2", + "O": "3" + } + }, + { + "formula": "SiO2", + "amount": "74-x", + "elements": { + "Si": "1", + "O": "2" + }, + "species": { + "Si": "1", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "(40-x)BaO·xFe2O3·(60-x)P2O5", + "parser_output": [ + { + "material_string": "(40-x)BaO·xFe2O3·(60-x)P2O5", + "material_name": "", + "material_formula": "(40-x)BaO-xFe2O3-(60-x)P2O5", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "BaO", + "amount": "40-x", + "elements": { + "Ba": "1", + "O": "1" + }, + "species": { + "Ba": "1", + "O": "1" + } + }, + { + "formula": "Fe2O3", + "amount": "x", + "elements": { + "Fe": "2", + "O": "3" + }, + "species": { + "Fe": "2", + "O": "3" + } + }, + { + "formula": "P2O5", + "amount": "60-x", + "elements": { + "P": "2", + "O": "5" + }, + "species": { + "P": "2", + "O": "5" + } + } + ] + } + ] + }, + { + "material": "K2O·3Al2O3·6SiO2·2H2O", + "parser_output": [ + { + "material_string": "K2O·3Al2O3·6SiO2·2H2O", + "material_name": "", + "material_formula": "K2O-3Al2O3-6SiO2·2H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "K2O", + "amount": "1", + "elements": { + "K": "2", + "O": "1" + }, + "species": { + "K": "2", + "O": "1" + } + }, + { + "formula": "Al2O3", + "amount": "3", + "elements": { + "Al": "2", + "O": "3" + }, + "species": { + "Al": "2", + "O": "3" + } + }, + { + "formula": "SiO2", + "amount": "6", + "elements": { + "Si": "1", + "O": "2" + }, + "species": { + "Si": "1", + "O": "2" + } + }, + { + "formula": "H2O", + "amount": "2", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "(1-x)(K0.5Na0.5)NbO3-xBi(Mg0.5Ti0.5)O3", + "parser_output": [ + { + "material_string": "(1-x)(K0.5Na0.5)NbO3-xBi(Mg0.5Ti0.5)O3", + "material_name": "", + "material_formula": "(1-x)(K0.5Na0.5)NbO3-xBi(Mg0.5Ti0.5)O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(K0.5Na0.5)NbO3", + "amount": "1-x", + "elements": { + "K": "0.5", + "Na": "0.5", + "Nb": "1", + "O": "3" + }, + "species": { + "K": "0.5", + "Na": "0.5", + "NbO3": "1" + } + }, + { + "formula": "Bi(Mg0.5Ti0.5)O3", + "amount": "x", + "elements": { + "Mg": "0.5", + "Ti": "0.5", + "Bi": "1", + "O": "3" + }, + "species": { + "Mg": "0.5", + "Ti": "0.5", + "Bi": "1", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "(1-x)(0.948K0.5Na0.5NbO3-0.052LiSbO3)-xBi2O3", + "parser_output": [ + { + "material_string": "(1-x)(0.948K0.5Na0.5NbO3-0.052LiSbO3)-xBi2O3", + "material_name": "", + "material_formula": "(0.948-0.948x)K0.5Na0.5NbO3-(0.052-0.052x)LiSbO3-xBi2O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "K0.5Na0.5NbO3", + "amount": "0.948-0.948*x", + "elements": { + "K": "0.5", + "Na": "0.5", + "Nb": "1", + "O": "3" + }, + "species": { + "K": "0.5", + "Na": "0.5", + "NbO3": "1" + } + }, + { + "formula": "LiSbO3", + "amount": "0.052-0.052*x", + "elements": { + "Li": "1", + "Sb": "1", + "O": "3" + }, + "species": { + "Li": "1", + "SbO3": "1" + } + }, + { + "formula": "Bi2O3", + "amount": "x", + "elements": { + "Bi": "2", + "O": "3" + }, + "species": { + "Bi": "2", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "65GeS2-22Ga2S3-(3-x)La2S3-10AgI-xTm2S3", + "parser_output": [ + { + "material_string": "65GeS2-22Ga2S3-(3-x)La2S3-10AgI-xTm2S3", + "material_name": "", + "material_formula": "65GeS2-22Ga2S3-(3-x)La2S3-10AgI-xTm2S3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "GeS2", + "amount": "65", + "elements": { + "Ge": "1", + "S": "2" + }, + "species": { + "Ge": "1", + "S": "2" + } + }, + { + "formula": "Ga2S3", + "amount": "22", + "elements": { + "Ga": "2", + "S": "3" + }, + "species": { + "Ga": "2", + "S": "3" + } + }, + { + "formula": "La2S3", + "amount": "3-x", + "elements": { + "La": "2", + "S": "3" + }, + "species": { + "La": "2", + "S": "3" + } + }, + { + "formula": "AgI", + "amount": "10", + "elements": { + "Ag": "1", + "I": "1" + }, + "species": { + "Ag": "1", + "I": "1" + } + }, + { + "formula": "Tm2S3", + "amount": "x", + "elements": { + "Tm": "2", + "S": "3" + }, + "species": { + "Tm": "2", + "S": "3" + } + } + ] + } + ] + }, + { + "material": "BaO-B2O3-BaF2", + "parser_output": [ + { + "material_string": "BaO-B2O3-BaF2", + "material_name": "", + "material_formula": "BaO-B2O3-BaF2", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "BaO", + "amount": "1", + "elements": { + "Ba": "1", + "O": "1" + }, + "species": { + "Ba": "1", + "O": "1" + } + }, + { + "formula": "B2O3", + "amount": "1", + "elements": { + "B": "2", + "O": "3" + }, + "species": { + "B": "2", + "O": "3" + } + }, + { + "formula": "BaF2", + "amount": "1", + "elements": { + "Ba": "1", + "F": "2" + }, + "species": { + "Ba": "1", + "F": "2" + } + } + ] + } + ] + }, + { + "material": "0.8BiSmxFe1-xO3-0.2PbTiO3", + "parser_output": [ + { + "material_string": "0.8BiSmxFe1-xO3-0.2PbTiO3", + "material_name": "", + "material_formula": "0.8BiSmxFe1-xO3-0.2PbTiO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "BiSmxFe1-xO3", + "amount": "0.8", + "elements": { + "Bi": "1", + "Sm": "x", + "Fe": "1-x", + "O": "3" + }, + "species": { + "Bi": "1", + "Sm": "x", + "Fe": "1-x", + "O": "3" + } + }, + { + "formula": "PbTiO3", + "amount": "0.2", + "elements": { + "Pb": "1", + "Ti": "1", + "O": "3" + }, + "species": { + "Pb": "1", + "TiO3": "1" + } + } + ] + } + ] + }, + { + "material": "(Ba0.5Sr0.5Co0.8Fe0.2O3)-(BaCe0.5Zr0.3Y0.16Zn0.04O3)", + "parser_output": [ + { + "material_string": "(Ba0.5Sr0.5Co0.8Fe0.2O3)-(BaCe0.5Zr0.3Y0.16Zn0.04O3)", + "material_name": "", + "material_formula": "Ba0.5Sr0.5Co0.8Fe0.2O3-BaCe0.5Zr0.3Y0.16Zn0.04O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "Ba0.5Sr0.5Co0.8Fe0.2O3", + "amount": "1", + "elements": { + "Ba": "0.5", + "Sr": "0.5", + "Co": "0.8", + "Fe": "0.2", + "O": "3" + }, + "species": { + "Ba": "0.5", + "Sr": "0.5", + "Co": "0.8", + "Fe": "0.2", + "O": "3" + } + }, + { + "formula": "BaCe0.5Zr0.3Y0.16Zn0.04O3", + "amount": "1", + "elements": { + "Ba": "1", + "Ce": "0.5", + "Zr": "0.3", + "Y": "0.16", + "Zn": "0.04", + "O": "3" + }, + "species": { + "Ba": "1", + "Ce": "0.5", + "Zr": "0.3", + "Y": "0.16", + "Zn": "0.04", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "Ba(Mg(1-x)/3SnxTa2(1-x)/3)O3", + "parser_output": [ + { + "material_string": "Ba(Mg(1-x)/3SnxTa2(1-x)/3)O3", + "material_name": "", + "material_formula": "Ba(Mg1/3-x/3SnxTa2/3-2x/3)O3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Ba(Mg1/3-x/3SnxTa2/3-2*x/3)O3", + "amount": "1", + "elements": { + "Mg": "0.333-0.333*x", + "Sn": "x", + "Ta": "0.667-0.667*x", + "Ba": "1", + "O": "3" + }, + "species": { + "Mg": "0.333-0.333*x", + "Sn": "x", + "Ta": "0.667-0.667*x", + "Ba": "1", + "O": "3" + } + } + ] + } + ] + }, + { + "material": "Na1+x+yZr2-yYySixP3-xO12", + "parser_output": [ + { + "material_string": "Na1+x+yZr2-yYySixP3-xO12", + "material_name": "", + "material_formula": "Na1+x+yZr2-yYySixP3-xO12", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + }, + "y": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Na1+x+yZr2-yYySixP3-xO12", + "amount": "1", + "elements": { + "Na": "x+y+1", + "Zr": "2-y", + "Y": "y", + "Si": "x", + "P": "3-x", + "O": "12" + }, + "species": { + "Na": "x+y+1", + "Zr": "2-y", + "Y": "y", + "Si": "x", + "P": "3-x", + "O": "12" + } + } + ] + } + ] + }, + { + "material": "(1-x)KNN-xBT", + "parser_output": [ + { + "material_string": "(1-x)KNN-xBT", + "material_name": "", + "material_formula": "(1-x)K0.5Na0.5NbO3-xBaTiO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "K0.5Na0.5NbO3", + "amount": "1-x", + "elements": { + "K": "0.5", + "Na": "0.5", + "Nb": "1", + "O": "3" + }, + "species": { + "K": "0.5", + "Na": "0.5", + "NbO3": "1" + } + }, + { + "formula": "BaTiO3", + "amount": "x", + "elements": { + "Ba": "1", + "Ti": "1", + "O": "3" + }, + "species": { + "Ba": "1", + "TiO3": "1" + } + } + ] + } + ] + }, + { + "material": "SmCl3·xH2O", + "parser_output": [ + { + "material_string": "SmCl3·xH2O", + "material_name": "", + "material_formula": "SmCl3·xH2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "SmCl3", + "amount": "1", + "elements": { + "Sm": "1", + "Cl": "3" + }, + "species": { + "Sm": "1", + "Cl": "3" + } + }, + { + "formula": "H2O", + "amount": "x", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "(1-x-y)BaTiO3-xBaBiO3-y(Bi0.5Na0.5)TiO3", + "parser_output": [ + { + "material_string": "(1-x-y)BaTiO3-xBaBiO3-y(Bi0.5Na0.5)TiO3", + "material_name": "", + "material_formula": "(1-x-y)BaTiO3-xBaBiO3-y(Bi0.5Na0.5)TiO3", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "BaTiO3", + "amount": "1-x-y", + "elements": { + "Ba": "1", + "Ti": "1", + "O": "3" + }, + "species": { + "Ba": "1", + "TiO3": "1" + } + }, + { + "formula": "BaBiO3", + "amount": "x", + "elements": { + "Ba": "1", + "Bi": "1", + "O": "3" + }, + "species": { + "Ba": "1", + "Bi": "1", + "O": "3" + } + }, + { + "formula": "(Bi0.5Na0.5)TiO3", + "amount": "y", + "elements": { + "Bi": "0.5", + "Na": "0.5", + "Ti": "1", + "O": "3" + }, + "species": { + "Bi": "0.5", + "Na": "0.5", + "TiO3": "1" + } + } + ] + } + ] + }, + { + "material": "(NH4)6Mo7O24·4H2O", + "parser_output": [ + { + "material_string": "(NH4)6Mo7O24·4H2O", + "material_name": "", + "material_formula": "(NH4)6Mo7O24·4H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "(NH4)6Mo7O24", + "amount": "1", + "elements": { + "N": "6", + "H": "24", + "Mo": "7", + "O": "24" + }, + "species": { + "NH4": "6", + "Mo7O24": "1" + } + }, + { + "formula": "H2O", + "amount": "4", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + }, + { + "material": "CdCl2·2.5H2O", + "parser_output": [ + { + "material_string": "CdCl2·2.5H2O", + "material_name": "", + "material_formula": "CdCl2·2.5H2O", + "additives": [], + "phase": "", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "CdCl2", + "amount": "1", + "elements": { + "Cd": "1", + "Cl": "2" + }, + "species": { + "Cd": "1", + "Cl": "2" + } + }, + { + "formula": "H2O", + "amount": "2.5", + "elements": { + "H": "2", + "O": "1" + }, + "species": { + "H2O": "1" + } + } + ] + } + ] + } +] \ No newline at end of file diff --git a/tests/resources/phases.json b/tests/resources/phases.json new file mode 100644 index 0000000..bfc92df --- /dev/null +++ b/tests/resources/phases.json @@ -0,0 +1,97 @@ +[ + { + "material": "P2-Na2/3(CoxNi1/3-xMn2/3)O2", + "parser_output": [ + { + "material_string": "P2-Na2/3(CoxNi1/3-xMn2/3)O2", + "material_name": "", + "material_formula": "Na2/3(CoxNi1/3-xMn2/3)O2", + "additives": [], + "phase": "P2", + "oxygen_deficiency": "", + "amounts_x": { + "x": { + "values": [], + "max_value": null, + "min_value": null + } + }, + "elements_x": {}, + "composition": [ + { + "formula": "Na2/3(CoxNi1/3-xMn2/3)O2", + "amount": "1", + "elements": { + "Co": "x", + "Ni": "0.333-x", + "Mn": "0.667", + "Na": "0.667", + "O": "2" + }, + "species": { + "Co": "x", + "Ni": "0.333-x", + "Mn": "0.667", + "Na": "0.667", + "O": "2" + } + } + ] + } + ] + }, + { + "material": "γ-MnOOH", + "parser_output": [ + { + "material_string": "γ-MnOOH", + "material_name": "", + "material_formula": "MnOOH", + "additives": [], + "phase": "γ", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "MnOOH", + "amount": "1", + "elements": { + "Mn": "1", + "O": "2", + "H": "1" + }, + "species": {} + } + ] + } + ] + }, + { + "material": "g-MnOOH", + "parser_output": [ + { + "material_string": "g-MnOOH", + "material_name": "", + "material_formula": "MnOOH", + "additives": [], + "phase": "g", + "oxygen_deficiency": "", + "amounts_x": {}, + "elements_x": {}, + "composition": [ + { + "formula": "MnOOH", + "amount": "1", + "elements": { + "Mn": "1", + "O": "2", + "H": "1" + }, + "species": {} + } + ] + } + ] + } +] \ No newline at end of file diff --git a/tests/tde_log.txt b/tests/tde_log.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..07c3e2a --- /dev/null +++ b/tests/test.py @@ -0,0 +1,34 @@ +# coding=utf-8 +import heapq +import json + +from text2chem.regex_parser import RegExParser +from text2chem.parser_pipeline import ParserPipelineBuilder +from text2chem.preprocessing_tools.additives_processing import AdditivesProcessing +from text2chem.preprocessing_tools.chemical_name_processing import ChemicalNameProcessing +from text2chem.preprocessing_tools.mixture_processing import MixtureProcessing +from text2chem.preprocessing_tools.phase_processing import PhaseProcessing +from text2chem.postprocessing_tools.substitute_additives import SubstituteAdditives + + +mp = ParserPipelineBuilder() \ + .add_preprocessing(AdditivesProcessing) \ + .add_preprocessing(ChemicalNameProcessing) \ + .add_preprocessing(PhaseProcessing) \ + .add_preprocessing(MixtureProcessing)\ + .add_postprocessing(SubstituteAdditives)\ + .set_regex_parser(RegExParser)\ + .build() + + +test_string = "(Ba4)Nd28/3Ti(18-y)Ga4y/3O54" + +res = { + "material": test_string, + "parser_output": [mp.parse(test_string).to_dict()] +} + +# P2-Na7/10Mn3/4Fe(1/4-x-y)NixCoyO2 P2-Na0.7Mn0.75Fe0.25-x-yNixCoyO2 +print(json.dumps(res)) + + diff --git a/tests/test_cathodetext2chem.py b/tests/test_cathodetext2chem.py new file mode 100644 index 0000000..4e0955f --- /dev/null +++ b/tests/test_cathodetext2chem.py @@ -0,0 +1,41 @@ +# coding=utf-8 +import json +import os +import unittest + +from cathodedataextractor.cathodetext2chem import ( + CathodeParserPipelineBuilder, + CathodeRegExParser, + CathodeStoichiometricVariablesProcessing +) +from tests.resources import TEST_PATH +from text2chem.preprocessing_tools.additives_processing import AdditivesProcessing +from text2chem.preprocessing_tools.chemical_name_processing import ChemicalNameProcessing +from text2chem.preprocessing_tools.phase_processing import PhaseProcessing +from text2chem.preprocessing_tools.mixture_processing import MixtureProcessing +from text2chem.postprocessing_tools.substitute_additives import SubstituteAdditives + +mp = CathodeParserPipelineBuilder() \ + .add_preprocessing(AdditivesProcessing) \ + .add_preprocessing(ChemicalNameProcessing) \ + .add_preprocessing(PhaseProcessing) \ + .add_preprocessing(MixtureProcessing) \ + .add_postprocessing(SubstituteAdditives) \ + .add_postprocessing(CathodeStoichiometricVariablesProcessing) \ + .set_regex_parser(CathodeRegExParser) \ + .build() + + +class TestCathodeText2chem(unittest.TestCase): + @staticmethod + def return_data(testdata): + for idx, data in enumerate(testdata): + chem_name = data["material"] + output = data["parser_output"] + result = mp.parse(chem_name).to_dict() + yield output, result + + def test(self): + testdata = json.loads(open(os.path.join(TEST_PATH, "cathode.json")).read()) + for output, result in self.return_data(testdata): + self.assertEqual(output, result) diff --git a/tests/test_nlp_abbr.py b/tests/test_nlp_abbr.py new file mode 100644 index 0000000..aadbf3f --- /dev/null +++ b/tests/test_nlp_abbr.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +import unittest +from cathodedataextractor.text import BatteriesTextProcessor +from cathodedataextractor.nlp import AbbreviationDetection + + +class TestAbbreviationDetection(unittest.TestCase): + + def test_abbr(self): + exams = [("... Na[(Mn0.4Fe0.3Ni0.3)1−xTix]O2 (x = 0, MFN) ...", [('MFN', 'NaMn0.4Fe0.3Ni0.3O2')]), + + ("... Na0.6MnO2, Na0.6Mn0.95Fe0.05O2, ... marked as MFT0, MF5, ...", + [('MFT0', 'Na0.6MnO2'), ('MF5', 'Na0.6Mn0.95Fe0.05O2')]), + + ("... O3-NaNi0.45Mn0.3Ti0.2M0.05O2 (M=Nb/Mo/Cr, " + "abbreviated as NMTNb, NMTMo and NMTCr, respectively) ..., ", + [('NMTNb', 'O3-NaTi0.2Nb0.05Mn0.3Ni0.45O2.0'), + ('NMTMo', 'O3-NaTi0.2Mo0.05Mn0.3Ni0.45O2.0'), + ('NMTCr', 'O3-NaTi0.2Cr0.05Mn0.3Ni0.45O2.0')]), + + ("... Na0.67Ni0.31Mn0.67Y0.02O2 (NMY2) ...", [('NMY2', 'Na0.67Y0.02Mn0.67Ni0.31O2')]), + + ("Na0.67Ni0.23Mg0.1Mn0.67O2 ... (denoted as NNMMO-MP)", [('NNMMO-MP', 'Na0.67Mg0.1Mn0.67Ni0.23O2')]), + + ("... h-NM, h-NMC and h-NMC2 are identified as Na0.66MnO2, " + "Na0.65Mn0.9Cu0.1O2 and Na0.63Mn0.8Cu0.2O2, ...", [('h-NM', 'Na0.66MnO2'), + ('h-NMC', 'Na0.65Mn0.9Cu0.1O2'), + ('h-NMC2', 'Na0.63Mn0.8Cu0.2O2')]), + + ("... Na0.67Ni0.33Mn0.67O2 (NM)", [('NM', 'Na0.67Mn0.67Ni0.33O2')]), + + ("(Na2/3Ni1/3Mn2/3O2, P2-NNMO) ", [('P2-NNMO', 'Na0.67Mn0.67Ni0.33O2')]), + + ("... (tetragonal Na3V2(PO4)2O2F, abbreviated as NVPOF) ... ", [('NVPOF', 'Na3V2P2O10F')]), + + ("... Na0.67MnO2, tunnel compound of Na0.44MnO2 and pure ..., " + "i.e. Na0.6Fe0.02Mn0.98O2 and Na0.6Fe0.06Mn0.94O2 ... " + "denoted as T-NM, L-NM, LT-NM, LT-NFM2 and L-NFM6, respectively.", [('T-NM', 'Na0.67MnO2'), + ('L-NM', 'Na0.44MnO2'), + ('LT-NM', 'Na0.6Mn0.98Fe0.02O2'), + ('LT-NFM2', + 'Na0.6Mn0.94Fe0.06O2')]), + + ("... Na3V2P3O12 and Na3V2P2O8F3 (NVPF) .", [('NVPF', 'Na3V2P2O8F3')]) + + ] + for exam in exams: + bp = BatteriesTextProcessor(exam[0], special_normal=True) + abbr = AbbreviationDetection() + self.assertEqual(abbr(' '.join(bp.processed_text)).new_abbreviation, exam[1]) diff --git a/tests/test_nlp_cner.py b/tests/test_nlp_cner.py new file mode 100644 index 0000000..d65eac4 --- /dev/null +++ b/tests/test_nlp_cner.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +import unittest + +from cathodedataextractor.nlp import CNer + + +class TestCNer(unittest.TestCase): + ner = CNer() + + def test_normalized_compound_formula(self): + + cem_normalized_compound_formula = [ + ('Na0.67Ni0.31Mn0.67Y0.02O2(NMY-2)', 'Na0.67Y0.02Mn0.67Ni0.31O2 (NMY-2)'), + ('Na0.67Ni0.28Mn0.67Y0.05O2(NMY-5', 'Na0.67Y0.05Mn0.67Ni0.28O2 (NMY-5'), + ('Na0.66Li0.18Mn0.71Ni0.2Co0.08O2-δ', 'Na0.66Li0.18Mn0.71Co0.08Ni0.2O2'), + ('Na0.66Li0.18Mn0.71Ni0.2Co0.08O2+δ', 'Na0.66Li0.18Mn0.71Co0.08Ni0.2O2'), + ('Na0.7(Mn0.6Ni0.2Mg0.2)O2', 'Na0.7Mg0.2Mn0.6Ni0.2O2'), + ('Na2/3MnO2', 'Na0.67MnO2'), + ('Na9/10Cr1/2Fe1/2O2', 'Na0.9Cr0.5Fe0.5O2'), + ('Na0.8(Li0.33Mn0.67-xTix)O2 (x = 0, 0.05, 0.1, 0.2)', 'Na0.8(Li0.33Mn0.67-xTix)O2 (x = 0, 0.05, 0.1, 0.2)'), + ('Na0.8(Li0.33Mn0.67-xTix)O2(x = 0, 0.05, 0.1, 0.2)', 'Na0.8(Li0.33Mn0.67-xTix)O2 (x = 0, 0.05, 0.1, 0.2)'), + ('Na2/3(Co3+0.15Mn3+0.52Mn4+0.33)O2', 'Na0.67Mn0.85Co0.15O2'), + ('NaxMn0.6Ni0.4O2 (0.75 99%', 'NaH2PO2·H2O,>99%'), + ('Na3PS4()', 'Na3PS4'), + ('NaMg0.67Ru0.33O2 ()', 'NaMg0.67Ru0.33O2'), + ('Na3Ni2Sb1-xRuxO6 (x=0, 0.1, 0.2, and 0.3)', 'Na3Ni2Sb1-xRuxO6 (x=0, 0.1, 0.2, and 0.3)'), + ] + + for cem_pro in cem_normalized_compound_formula: + self.assertEqual(self.ner.normalized_compound_formula(cem_pro[0]), cem_pro[1]) + + def test_is_compound_formula(self): + test2 = [ + ('Na0.67(Ni0.3Mn0.5Fe0.2)1-xZrxO2', ('Na0.67(Ni0.3Mn0.5Fe0.2)1-xZrxO2', ['Na', 'Ni', 'Mn', 'Fe', 'Zr', 'O'])), + ('P2-Na0.67(Ni0.3Mn0.5Fe0.2)0.95Zr0.05O2', ('P2-Na0.67(Ni0.3Mn0.5Fe0.2)0.95Zr0.05O2', ['Na', 'Ni', 'Mn', 'Fe', 'Zr', 'O'])), + ('NaNi0.45Mn0.4Ti0.1Co0.05O2-LiF', ('NaNi0.45Mn0.4Ti0.1Co0.05O2-LiF', ['Na', 'Ni', 'Mn', 'Ti', 'Co', 'O', 'Li', 'F'])), + ('Na0.70Ni0.20Cu0.15Mn(0.65-x)TixO2', ('Na0.70Ni0.20Cu0.15Mn(0.65-x)TixO2', ['Na', 'Ni', 'Cu', 'Mn', 'Ti', 'O'])), + ('Na2/3Ni1/3Co1/3Mn1/3O2', ('Na2/3Ni1/3Co1/3Mn1/3O2', ['Na', 'Ni', 'Co', 'Mn', 'O'])), + ('Na0·667Mn0·667Ni0·333O2', ('Na0·667Mn0·667Ni0·333O2', ['Na', 'Mn', 'Ni', 'O'])), + ('Na0.67Fe0.5-x/2Mn0.5-x/2TixO2 (x = 0, 0.01, 0.05, 0.10)', ('Na0.67Fe0.5-x/2Mn0.5-x/2TixO2 (x = 0, 0.01, 0.05, 0.10)', ['Na', 'Fe', 'Mn', 'Ti', 'O'])), + ('Na0.71Co1-xZnxO2 (0 ≤ x ≤ 0.02)', ('Na0.71Co1-xZnxO2 (0 ≤ x ≤ 0.02)', ['Na', 'Co', 'Zn', 'O'])), + ('P2-Na0.67+xNi0.33Mn0.67O2', ('P2-Na0.67+xNi0.33Mn0.67O2', ['Na', 'Ni', 'Mn', 'O'])), + ('NaxMn1/3Fe1/3Ni1/3O2 (x = 2/3 and 1)', ('NaxMn1/3Fe1/3Ni1/3O2 (x = 2/3 and 1)', ['Na', 'Mn', 'Fe', 'Ni', 'O'])), + ('MxC2O4·xH2O', False), + ('ethanol', False), + ('P2-NaNM', False), + ('Na2CO3', ('Na2CO3', ['Na', 'C', 'O'])), + ] + for t2 in test2: + self.assertEqual(self.ner.is_compound_formula(t2[0]), t2[1]) + + def test_prompt_tag(self): + self.assertEqual(self.ner.prompt_tag('Na2CO3'), 'simple') + self.assertEqual(self.ner.prompt_tag('Na2CO3R'), 'raw_material') + self.assertEqual(self.ner.prompt_tag('NaCl'), 'simple') + self.assertEqual(self.ner.prompt_tag('NaF'), 'simple') + self.assertEqual(self.ner.prompt_tag('NH4Cl'), 'raw_material') + self.assertEqual(self.ner.prompt_tag('Na0·667Mn0·667Ni0·333O2'), 'synthetic') + self.assertEqual(self.ner.prompt_tag('NM00'), 'is_likely_abbreviation') + self.assertEqual(self.ner.prompt_tag('NCMTV'), 'is_likely_abbreviation') + self.assertEqual(self.ner.prompt_tag('NCF'), 'is_likely_abbreviation') + self.assertEqual(self.ner.prompt_tag('EC300J'), 'other') + self.assertEqual(self.ner.prompt_tag('PC'), 'other') + self.assertEqual(self.ner.prompt_tag('PO4'), 'polyatomic_ions') + self.assertEqual(self.ner.prompt_tag('Na-ion'), 'other') + self.assertEqual(self.ner.prompt_tag('Na-N532'), 'is_likely_abbreviation') + self.assertEqual(self.ner.prompt_tag('Nalgene'), 'other') + self.assertEqual(self.ner.prompt_tag('No.166'), 'other') + self.assertEqual(self.ner.prompt_tag('No.54-0894'), 'other') + self.assertEqual(self.ner.prompt_tag('0-NMTO'), 'is_likely_abbreviation') + self.assertEqual(self.ner.prompt_tag('NaMnNiCuFeTiOF'), 'is_likely_abbreviation') + self.assertEqual(self.ner.prompt_tag('Mn-Na-Mn'), 'irregular_shape') + self.assertEqual(self.ner.prompt_tag('P2/O3-NMT3'), 'is_likely_abbreviation') + self.assertEqual(self.ner.prompt_tag('Ti-doped-NNMOF'), 'is_likely_abbreviation') diff --git a/tests/test_nlp_tokenize.py b/tests/test_nlp_tokenize.py new file mode 100644 index 0000000..112ad9f --- /dev/null +++ b/tests/test_nlp_tokenize.py @@ -0,0 +1,994 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +test_nlp_tokenize +~~~~~~~~~~~~~~~~~ + +Tests for tokenization. + +""" +import logging +import unittest +import re + +from chemdataextractor.doc.text import Text, Sentence +from chemdataextractor.nlp.tokenize import WordTokenizer, ChemWordTokenizer, FineWordTokenizer +from chemdataextractor.parse import R +from chemdataextractor.model import QuantityModel +from chemdataextractor.model.units import Dimension, Unit + +logging.basicConfig(level=logging.DEBUG) +log = logging.getLogger(__name__) + + +class TestWordTokenizer(unittest.TestCase): + """Test the standard word tokenizer.""" + + maxDiff = None + + def setUp(self): + self.t = WordTokenizer() + + def test_final_full_stop(self): + """Test the word tokenizer splits off final full stop only.""" + self.assertEqual( + ['This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.'], + self.t.tokenize('This is Mr. Hoppy\'s tortoise.') + ) + + def test_full_stop_following(self): + """Test the word tokenizer splits off final full stop if followed by brackets or quotes.""" + self.assertEqual( + ['(', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.', ')'], + self.t.tokenize('(This is Mr. Hoppy\'s tortoise.)') + ) + self.assertEqual( + ['"', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.', '"'], + self.t.tokenize('"This is Mr. Hoppy\'s tortoise."') + ) + self.assertEqual( + ['"', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tort.oise', '.', '"'], + self.t.tokenize('"This is Mr. Hoppy\'s tort.oise."') + ) + + def test_dollar(self): + """Test the word tokenizer on dollar symbol.""" + self.assertEqual( + ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.'], + self.t.tokenize('On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.') + ) + + def test_quote(self): + """Test the word tokenizer on quotes.""" + self.assertEqual( + ['"', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', '"', 'Slocum', 'said', '.'], + self.t.tokenize('"We beat some pretty good teams to get here," Slocum said.') + ) + + def test_brackets_quotes(self): + """Test the word tokenizer on brackets and quotes.""" + self.assertEqual( + ['Well', ',', 'we', 'could', 'n\'t', 'have', 'this', 'predictable', ',', 'cliche', '-', 'ridden', ',', '"', 'Touched', 'by', 'an', 'Angel', '"', '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', 'n\'t', '.'], + self.t.tokenize('Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.') + ) + + def test_exclamation(self): + """Test the word tokenizer on exclamation mark.""" + self.assertEqual( + ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!'], + self.t.tokenize('I cannot cannot work under these conditions!') + ) + + def test_digit_comma(self): + """Test the word tokenizer on commas within numbers.""" + self.assertEqual( + ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.'], + self.t.tokenize('The company spent $30,000,000 last year.') + ) + + def test_decimal_number(self): + """Test the word tokenizer on number containing full stop.""" + self.assertEqual( + ['It', '\'s', '2.45', 'cats', 'per', 'mango', '.'], + self.t.tokenize('It\'s 2.45 cats per mango.') + ) + + def test_phone_number(self): + """Test the word tokenizer on phone number containing hyphens""" + self.assertEqual( + ['Call', 'me', 'at', '02-2348-2192', '.'], + self.t.tokenize('Call me at 02-2348-2192.') + ) + + def test_percentage(self): + """Test the word tokenizer on percent sign.""" + self.assertEqual( + ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.'], + self.t.tokenize('The company spent 40.75% of its income last year.') + ) + + def test_colon_time(self): + """Test the word tokenizer on colon between digits in a time.""" + self.assertEqual( + ['He', 'arrived', 'at', '3:00', 'pm', '.'], + self.t.tokenize('He arrived at 3:00 pm.') + ) + + def test_word_colon(self): + """Test the word tokenizer on colon after word.""" + self.assertEqual( + ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.'], + self.t.tokenize('I bought these items: books, pencils, and pens.') + ) + + def test_digit_comma_space(self): + """Test the word tokenizer on comma between digits with a space.""" + self.assertEqual( + ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.'], + self.t.tokenize('Though there were 150, 100 of them were old.') + ) + + def test_digit_comma_multiple(self): + """Test the word tokenizer on comma at end of digits.""" + self.assertEqual( + ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.'], + self.t.tokenize('There were 300,000, but that wasn\'t enough.') + ) + + def test_theyll(self): + """Test the word tokenizer on the word they'll.""" + self.assertEqual( + ['They', "'ll", 'save', 'and', 'invest', 'more', '.'], + self.t.tokenize('They\'ll save and invest more.') + ) + + def test_bracket1(self): + """Test the word tokenizer on sentence containing brackets.""" + self.assertEqual( + ['For', 'a', 'few', 'weeks', '(', '>', '24', 'days', ')', '.'], + self.t.tokenize('For a few weeks (>24 days).') + ) + + def test_contractions(self): + """Test the word tokenizer on contractions.""" + self.assertEqual( + ['Lem', 'me', 'in', ',', 'I', 'got', 'ta', 'gim', 'me', 'some', 'things', '.'], + self.t.tokenize('Lemme in, I gotta gimme some things.') + ) + + def test_url(self): + """Test the word tokenizer on URL.""" + self.assertEqual( + ['The', 'address', 'is', 'http://www.chemdataextractor.org'], + self.t.tokenize('The address is http://www.chemdataextractor.org') + ) + + def test_text_sentence(self): + """Test tokenization through the Text and Sentence API.""" + t = Text('Hi, my name is Matt. What is your name?', word_tokenizer=WordTokenizer()) + self.assertEqual( + [['Hi', ',', 'my', 'name', 'is', 'Matt', '.'], ['What', 'is', 'your', 'name', '?']], + [sent.raw_tokens for sent in t.sentences] + ) + + def test_additional_regex(self): + sent = Sentence('See if we cansplit this') + additional_regex = re.compile('(?Pcan)split') + tokens = self.t.get_word_tokens(sent, additional_regex=[additional_regex]) + self.assertEqual(['See', 'if', 'we', 'can', 'split', 'this'], [token.text for token in tokens]) + + +class TestChemTokenizer(unittest.TestCase): + """Test the chemistry-aware word tokenizer.""" + + maxDiff = None + + def setUp(self): + self.t = ChemWordTokenizer() + + def test_tokenise_model(self): + + class Pressure(Dimension): + pass + + class PressureModel(QuantityModel): + dimensions = Pressure() + + class PressureUnit(Unit): + def __init__(self, magnitude=0.0, powers=None): + super(PressureUnit, self).__init__(Pressure(), magnitude, powers) + + class Pascal(PressureUnit): + def convert_value_to_standard(self, value): + return value + + def convert_value_from_standard(self, value): + return value + + def convert_error_to_standard(self, error): + return error + + def convert_error_from_standard(self, error): + return error + + units_dict = {R('Pa', group=0): Pascal} + Pressure.units_dict = units_dict + Pressure.standard_units = Pascal() + + sent = Sentence('The pressure was measured to be 12MPa', models=[PressureModel]) + tokens = self.t.get_word_tokens(sent) + self.assertEqual(['The', 'pressure', 'was', 'measured', 'to', 'be', '12', 'MPa'], + [token.text for token in tokens]) + + def test_final_full_stop(self): + """Test the word tokenizer splits off final full stop only.""" + self.assertEqual( + ['This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.'], + self.t.tokenize('This is Mr. Hoppy\'s tortoise.') + ) + + def test_full_stop_following(self): + """Test the word tokenizer splits off final full stop if followed by brackets or quotes.""" + self.assertEqual( + ['(', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.', ')'], + self.t.tokenize('(This is Mr. Hoppy\'s tortoise.)') + ) + self.assertEqual( + ['"', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.', '"'], + self.t.tokenize('"This is Mr. Hoppy\'s tortoise."') + ) + self.assertEqual( + ['"', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tort.oise', '.', '"'], + self.t.tokenize('"This is Mr. Hoppy\'s tort.oise."') + ) + + def test_dollar(self): + """Test the word tokenizer on dollar symbol.""" + self.assertEqual( + ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.'], + self.t.tokenize('On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.') + ) + + def test_quote(self): + """Test the word tokenizer on quotes.""" + self.assertEqual( + ['"', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', '"', 'Slocum', 'said', '.'], + self.t.tokenize('"We beat some pretty good teams to get here," Slocum said.') + ) + + def test_brackets_quotes(self): + """Test the word tokenizer on brackets and quotes.""" + self.assertEqual( + ['Well', ',', 'we', 'could', 'n\'t', 'have', 'this', 'predictable', ',', 'cliche', '-', 'ridden', ',', '"', 'Touched', 'by', 'an', 'Angel', '"', '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', 'n\'t', '.'], + self.t.tokenize('Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.') + ) + + def test_exclamation(self): + """Test the word tokenizer on exclamation mark.""" + self.assertEqual( + ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!'], + self.t.tokenize('I cannot cannot work under these conditions!') + ) + + def test_digit_comma(self): + """Test the word tokenizer on commas within numbers.""" + self.assertEqual( + ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.'], + self.t.tokenize('The company spent $30,000,000 last year.') + ) + + def test_percentage(self): + """Test the word tokenizer on percent sign.""" + self.assertEqual( + ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.'], + self.t.tokenize('The company spent 40.75% of its income last year.') + ) + + def test_colon_time(self): + """Test the word tokenizer on colon between digits in a time.""" + self.assertEqual( + ['He', 'arrived', 'at', '3', ':', '00', 'pm', '.'], + self.t.tokenize('He arrived at 3:00 pm.') + ) + + def test_word_colon(self): + """Test the word tokenizer on colon after word.""" + self.assertEqual( + ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.'], + self.t.tokenize('I bought these items: books, pencils, and pens.') + ) + + def test_digit_comma_space(self): + """Test the word tokenizer on comma between digits with a space.""" + self.assertEqual( + ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.'], + self.t.tokenize('Though there were 150, 100 of them were old.') + ) + + def test_digit_comma_multiple(self): + """Test the word tokenizer on comma at end of digits.""" + self.assertEqual( + ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.'], + self.t.tokenize('There were 300,000, but that wasn\'t enough.') + ) + + def test_theyll(self): + """Test the word tokenizer on the word they'll.""" + self.assertEqual( + ['They', "'ll", 'save', 'and', 'invest', 'more', '.'], + self.t.tokenize('They\'ll save and invest more.') + ) + + def test_bracket1(self): + """Test the word tokenizer on sentence containing brackets.""" + self.assertEqual( + ['For', 'a', 'few', 'weeks', '(', '>', '24', 'days', ')', '.'], + self.t.tokenize('For a few weeks (>24 days).') + ) + + def test_bracket2(self): + """Test the word tokenizer on sentence containing brackets.""" + self.assertEqual( + ['Coumarin', '343', '(', 'C343', ')', 'was', 'added', 'to', 'the', 'mixture', '.'], + self.t.tokenize('Coumarin 343 (C343) was added to the mixture.') + ) + + def test_bracket3(self): + """Test the word tokenizer on sentence containing brackets.""" + self.assertEqual( + ['(', 'Ka', ':', '1.42', '×', '10(10)', 'M-1', 'vs', '1.95', '±', '0.35', '×', '10(10)', 'M-1', ')', 'and', 'increased', '(', '9.9', 'vs', '3.7', '±', '0.4', 'fmol', ')'], + self.t.tokenize('(Ka: 1.42×10(10) M-1 vs 1.95±0.35×10(10) M-1) and increased (9.9 vs 3.7±0.4 fmol)') + ) + + def test_bracket4(self): + """Test the word tokenizer on sentence containing chemical name with commas and brackets.""" + self.assertEqual( + ['Pd-C', 'hydrogenation', 'of', '3,21-diacetoxy-5', 'beta,19-cyclopregna-2,9(11)-diene-4,20-dione', '(', '10', ')'], + self.t.tokenize('Pd-C hydrogenation of 3,21-diacetoxy-5 beta,19-cyclopregna-2,9(11)-diene-4,20-dione (10)') + ) + + def test_bracket5(self): + """Test the word tokenizer on sentence containing chemical name with commas and brackets.""" + self.assertEqual( + ['(2E)-3-[5-(2,3-Dimethoxy-6-methyl-1,4-benzoquinoyl)]-2-nonyl-2-propenoic', 'acid'], + self.t.tokenize('(2E)-3-[5-(2,3-Dimethoxy-6-methyl-1,4-benzoquinoyl)]-2-nonyl-2-propenoic acid') + ) + + def test_bracket6(self): + """Test the word tokenizer on sentence containing chemical name with prime and brackets.""" + self.assertEqual( + ['(2-Amino-3\'-methoxyphenyl)oxanaphthalen-4-one', '(', 'PD', '98059', ')'], + self.t.tokenize('(2-Amino-3\'-methoxyphenyl)oxanaphthalen-4-one (PD 98059)') + ) + + def test_bracket7(self): + """Test the word tokenizer on sentence containing chemical name with brackets.""" + self.assertEqual( + ['The', 'maximum', '(3H)-thymidine', 'incorporation', 'into', 'DNA', '.'], + self.t.tokenize('The maximum (3H)-thymidine incorporation into DNA.') + ) + + def test_comma1(self): + """Test the word tokenizer on sentence containing chemical name with commas.""" + self.assertEqual( + ['With', 'hot', 'acetic', '+', 'p-toluenesulfonic', 'acid', '5', 'underwent', 'rearrangement', 'to', '12-acetoxy-11', 'beta,19-epoxypregn-5-ene-4,20-dione', '(', '8', ')', '.'], + self.t.tokenize('With hot acetic + p-toluenesulfonic acid 5 underwent rearrangement to 12-acetoxy-11 beta,19-epoxypregn-5-ene-4,20-dione (8).') + ) + + def test_comma2(self): + """Test the word tokenizer on sentence containing chemical name with commas.""" + self.assertEqual( + ['N,N-Dimethylformamide', '(', 'DMF', ')', 'was', 'used', 'as', 'solvent', '.'], + self.t.tokenize('N,N-Dimethylformamide (DMF) was used as solvent.') + ) + + def test_comma3(self): + """Test the word tokenizer on sentence containing chemical name with commas.""" + self.assertEqual( + ['1,2-bis(o-aminophenoxy)-ethane-N,N,N\',N\'-tetraacetic', 'acid'], + self.t.tokenize('1,2-bis(o-aminophenoxy)-ethane-N,N,N\',N\'-tetraacetic acid') + ) + + def test_comma4(self): + """Test the word tokenizer on sentence containing chemical name with commas.""" + self.assertEqual( + ['N,N,N\',N\'-tetrakis', '[2-pyridylmethyl]ethylenediamine'], + self.t.tokenize('N,N,N\',N\'-tetrakis [2-pyridylmethyl]ethylenediamine') + ) + + def test_comma5(self): + """Test the word tokenizer on sentence containing chemical name with commas.""" + self.assertEqual( + ['o,o\'-bismyristoyl', 'and', 'α,α\'-dipyridyl'], + self.t.tokenize('o,o\'-bismyristoyl and α,α\'-dipyridyl') + ) + + def test_comma6(self): + """Test the word tokenizer on sentence containing chemical name with a prime character.""" + self.assertEqual( + ['The', 'characteristics', 'of', 'the', 'C(sp,sp2,sp3)', '–', 'H', '⋯', 'F', '–', 'C(sp,sp2,sp3)', 'intermolecular'], + self.t.tokenize('The characteristics of the C(sp,sp2,sp3)–H⋯F–C(sp,sp2,sp3) intermolecular') + ) + + def test_prime(self): + """Test the word tokenizer on sentence containing chemical name with a prime character.""" + self.assertEqual( + ['N,N′-heteroaromatic', 'ancillary', 'ligands'], + self.t.tokenize('N,N′-heteroaromatic ancillary ligands') + ) + + def test_square_bracket(self): + """Test the word tokenizer on sentence containing chemical name with square brackets.""" + self.assertEqual( + ['[Hg(dman)2]', ',', 'with', '[PdCl2(SEt2)2]', 'or', '[{PdCl2(PPh3)}2]'], + self.t.tokenize('[Hg(dman)2], with [PdCl2(SEt2)2] or [{PdCl2(PPh3)}2]') + ) + + def test_curly_bracket(self): + """Test the word tokenizer on sentence containing chemical name with curly brackets.""" + self.assertEqual( + ['bimetallic', 'complexes', ',', '{[Mn(salpn)]2[Fe(CN)5NO]}n', '(', '1', ')', ',', '{[Mn(salpn)(CH3OH)]4[Mn(CN)5NO]}[C(CN)3]·3H2O', '(', '2', ')', ',', '{[Mn(dapsc)][Fe(CN)5NO]·0.5CH3OH·0.25H2O}n', '(', '3', ')', 'and', '{[Mn(salpn)(CH3OH)]4[Fe(CN)5NO]}(ClO4)2·4H2O', '(', '4', ')', ',', 'where', 'salpn2−', '=', 'N,N′-1,3-propylene-bis(salicylideneiminato)', 'dianion', 'and', 'dapsc', '=', '2,6-diacetylpyridine-bis(semicarbazone)'], + self.t.tokenize('bimetallic complexes, {[Mn(salpn)]2[Fe(CN)5NO]}n (1), {[Mn(salpn)(CH3OH)]4[Mn(CN)5NO]}[C(CN)3]·3H2O (2), {[Mn(dapsc)][Fe(CN)5NO]·0.5CH3OH·0.25H2O}n (3) and {[Mn(salpn)(CH3OH)]4[Fe(CN)5NO]}(ClO4)2·4H2O (4), where salpn2− = N,N′-1,3-propylene-bis(salicylideneiminato) dianion and dapsc = 2,6-diacetylpyridine-bis(semicarbazone)') + ) + + def test_bracket_internal_split(self): + """Test the word tokenizer on sentence containing token that should be split internally and from brackets.""" + self.assertEqual( + ['Extensive', 'H', '-', 'bonding', '(', 'F', '⋯', 'H', '–', 'OH', ')', 'links', 'the', 'molecules'], + self.t.tokenize('Extensive H-bonding (F⋯H–OH) links the molecules') + ) + + def test_colon_chem(self): + """Test the word tokenizer on chemical name containing colon.""" + self.assertEqual( + ['Group', '13', 'fluorides', ',', 'MF3·3H2O', '(', 'M', '=', 'Al', ',', 'Ga', 'or', 'In', ')', 'with', '2,2′:6′,2′′-terpyridyl', ',', '2,2′-bipyridyl', 'or', '1,10-phenanthroline', 'under', 'hydrothermal', 'conditions', '(', '180', '°', 'C', '/', '15', 'h', ')'], + self.t.tokenize('Group 13 fluorides, MF3·3H2O (M = Al, Ga or In) with 2,2′:6′,2′′-terpyridyl, 2,2′-bipyridyl or 1,10-phenanthroline under hydrothermal conditions (180 °C/15 h)') + ) + + def test_positive_charge(self): + """Test the word tokenizer on chemical name with trailing plus.""" + self.assertEqual( + ['[⊂Me2N(CH2)2NMe(CH2)2]+', ',', 'with', 'fluorometallate', 'anions'], + self.t.tokenize('[⊂Me2N(CH2)2NMe(CH2)2]+, with fluorometallate anions') + ) + + def test_bracket_subsequent(self): + """Test the word tokenizer on chemical name with subsequent (not nested) brackets.""" + self.assertEqual( + ['The', 'α-ammonium-acylchloride', 'salts', '[NH2(CH2)3CHC(O)Cl][WOCl5]', ',', '1a', ',', 'and', '[MeNH2CH2C(O)Cl][WOCl5]', ',', '1b', '.'], + self.t.tokenize('The α-ammonium-acylchloride salts [NH2(CH2)3CHC(O)Cl][WOCl5], 1a, and [MeNH2CH2C(O)Cl][WOCl5], 1b.') + ) + + def test_bracket_subsequent2(self): + """Test the word tokenizer on chemical name with subsequent (not nested) brackets.""" + self.assertEqual( + ['(N,N)-thingy-(errrm)'], + self.t.tokenize('(N,N)-thingy-(errrm)') + ) + + def test_bracket_range(self): + """Test the word tokenizer on a bracketed range.""" + self.assertEqual( + ['(', 'a', ')', '–', '(', 'c', ')', 'Some', 'things', '.'], + self.t.tokenize('(a)–(c) Some things.') + ) + + def test_space_colon_digit(self): + """Test the word tokenizer on an usual spacing of colon.""" + self.assertEqual( + ['Intensity', 'ratio', 'of', '2', ':', '2', ':', '4', 'and', '54', ':', '18', '.'], + self.t.tokenize('Intensity ratio of 2 :2 :4 and 54 : 18.') + ) + + def test_quote_apostrophe(self): + """Test the word tokenizer when a quote is used as an apostrophe.""" + self.assertEqual( + ['Alzheimer', '\u2019s', 'disease', '(', 'AD', ')', 'was', '‘', 'extremely', '’', 'hard', '‘', 'to', 'diagnose', '.', '’'], + self.t.tokenize('Alzheimer’s disease (AD) was ‘extremely’ hard ‘to diagnose.’') + ) + + def test_quote_apostrophe2(self): + """Test the word tokenizer when a quote is used as an apostrophe.""" + self.assertEqual( + ['Alzheimer', '\u2019s', 'disease', '(', 'AD', ')', 'was', '‘', 'extremely', '’', 'hard', '‘', 'to', 'diagnose', '’', '.'], + self.t.tokenize('Alzheimer’s disease (AD) was ‘extremely’ hard ‘to diagnose’.') + ) + + def test_quote_apostrophe3(self): + """Test the word tokenizer when a quote is used as an apostrophe.""" + self.assertEqual( + ['Alzheimer', '\u2019s', 'disease', '(', 'AD', ')', 'was', '‘', 'extremely', '’', 'hard', '‘', 'to', 'diagnose', '’', ',', 'at', 'the', 'time', '.'], + self.t.tokenize('Alzheimer’s disease (AD) was ‘extremely’ hard ‘to diagnose’, at the time.') + ) + + def test_apostrophe_quote(self): + """Test the word tokenizer when an apostrophe is used as a quote.""" + self.assertEqual( + ["a", "Special", "Issue", "entitled", "'", "Cognitive", "Enhancers", "'", "."], + self.t.tokenize("a Special Issue entitled 'Cognitive Enhancers'.") + ) + + def test_double_apostrophe_quote(self): + """Test the word tokenizer when an apostrophe is used as a quote.""" + self.assertEqual( + ["between", "the", "''", "Schiff", "base", "''", "and"], + self.t.tokenize("between the ''Schiff base'' and") + ) + + def test_bracketed_url(self): + """Test the word tokenizer on a bracketed URL.""" + self.assertEqual( + ['QALIBRA', 'software', '(', 'www.qalibra.eu', ')'], + self.t.tokenize('QALIBRA software (www.qalibra.eu)') + ) + + def test_symbols(self): + self.assertEqual(['2', '+', '2', '=', '4'], self.t.tokenize('2+2=4')) + self.assertEqual(['+', '4', '°C'], self.t.tokenize('+4°C')) + self.assertEqual(['(', 'H2A', '<-->', 'HA-', '+', 'H+', 'and', 'HA-', '<-->', 'A', '=', '+', 'H+', ')'], self.t.tokenize('(H2A <--> HA- + H+ and HA- <--> A= + H+)')) + self.assertEqual(['[2+2+2]'], self.t.tokenize('[2+2+2]')) + self.assertEqual(['95.5', '+/-', '0.2', '%'], self.t.tokenize('95.5 +/- 0.2%')) + + def test_sentence_end(self): + self.assertEqual(['upon', 'addition', 'of', 'Ni(II)', ';'], self.t.tokenize('upon addition of Ni(II);')) + self.assertEqual(['upon', 'addition', 'of', 'Ni(II)', '.'], self.t.tokenize('upon addition of Ni(II).')) + self.assertEqual(['complexes', 'in', 'THF', '(', 'ii', ')', '.'], self.t.tokenize('complexes in THF (ii).')) + self.assertEqual(['complexes', 'in', 'THF', '(', 'ii', ')', ','], self.t.tokenize('complexes in THF (ii),')) + self.assertEqual(['measured', 'at', '303', 'K', '.'], self.t.tokenize('measured at 303 K.')) + self.assertEqual(['Sentence', 'trails', 'off', '…'], self.t.tokenize('Sentence trails off…')) + self.assertEqual(['Sentence', 'trails', 'off', '...'], self.t.tokenize('Sentence trails off...')) + self.assertEqual(['in', 'the', 'AUC', '.'], self.t.tokenize('in the AUC.')) + self.assertEqual(['for', 'lane', 'no.', '11', '.'], self.t.tokenize('for lane no. 11.')) + self.assertEqual(['under', 'A.', 'M.', '1.5', 'illumination'], self.t.tokenize('under A. M. 1.5 illumination')) + self.assertEqual(['space', 'group', 'P', '(', 'No.', '2', ')', '.'], self.t.tokenize('space group P (No. 2).')) + + def test_abbreviations(self): + self.assertEqual(['(', 'ca.', '30', 'mL', ')'], self.t.tokenize('(ca. 30 mL)')) + self.assertEqual(['Elements', ',', 'e.g.', 'calcium'], self.t.tokenize('Elements, e.g. calcium')) + + def test_more_brackets(self): + self.assertEqual(['NaOH', '(', 'aq', ')'], self.t.tokenize('NaOH(aq)')) + self.assertEqual(['HCl', '(', 'g', ')'], self.t.tokenize('HCl(g)')) + self.assertEqual(['5(g)'], self.t.tokenize('5(g)')) + self.assertEqual(['a', ')', 'UV', '/', 'vis', 'spectrum', '.'], self.t.tokenize('a) UV/vis spectrum.')) + self.assertEqual(['a', ')', 'UV', '-', 'vis', 'spectrum', '.'], self.t.tokenize('a) UV-vis spectrum.')) + self.assertEqual(['(', 'c', ')', '–', '(', 'e', ')'], self.t.tokenize('(c)–(e)')) + self.assertEqual(['THF', '(', 'i', ')', ',', 'toluene', '(', 'iii', ')'], self.t.tokenize('THF (i), toluene (iii)')) + self.assertEqual(['buffer', '(', 'pH', '7.4', ')', '.'], self.t.tokenize('buffer (pH 7.4).')) + + def test_multihyphens(self): + self.assertEqual(['---'], self.t.tokenize('---')) + self.assertEqual(['–––'], self.t.tokenize('–––')) + self.assertEqual(['———'], self.t.tokenize('———')) + self.assertEqual(['−−−'], self.t.tokenize('−−−')) + self.assertEqual(['--'], self.t.tokenize('--')) + self.assertEqual(['––'], self.t.tokenize('––')) + self.assertEqual(['——'], self.t.tokenize('——')) + self.assertEqual(['−−'], self.t.tokenize('−−')) + + def test_tilde(self): + self.assertEqual(['a', 'line', 'width', 'of', '\u223c', '3', 'Hz', '.'], self.t.tokenize('a line width of ∼3 Hz.')) + self.assertEqual(['a', 'line', 'width', 'of', '~', '3', 'Hz', '.'], self.t.tokenize('a line width of ~3 Hz.')) + + def test_slashes(self): + self.assertEqual(['methanol', '/', 'water'], self.t.tokenize('methanol/water')) + self.assertEqual(['B3LYP', '/', '6-311G(d,p)'], self.t.tokenize('B3LYP/6-311G(d,p)')) + + def test_iron_states(self): + self.assertEqual(['Fe(III)'], self.t.tokenize('Fe(III)')) + self.assertEqual(['Fe(iii)'], self.t.tokenize('Fe(iii)')) + self.assertEqual(['Fe(3+)'], self.t.tokenize('Fe(3+)')) + self.assertEqual(['Fe(0)'], self.t.tokenize('Fe(0)')) + + def test_identifiers(self): + self.assertEqual(['4CN'], self.t.tokenize('4CN')) + self.assertEqual(['2a'], self.t.tokenize('2a')) + + def test_colons(self): + self.assertEqual(['ethanol', ':', 'water'], self.t.tokenize('ethanol:water')) + self.assertEqual(['1', ':', '2'], self.t.tokenize('1:2')) + self.assertEqual(['1', ':', '2'], self.t.tokenize('1 : 2')) + self.assertEqual(['(', 'foo', ')', ':', '(', 'bar', ')'], self.t.tokenize('(foo):(bar)')) + self.assertEqual(['foo', ')', ':', '(', 'bar'], self.t.tokenize('foo):(bar')) + self.assertEqual(['4:7,10:13-diepoxy[15]annulenone'], self.t.tokenize('4:7,10:13-diepoxy[15]annulenone')) + self.assertEqual(['9-(5′,5-diphenyl[1,1′:3′,1′′:3′′,1:3,1′′′′-quinquephenyl]-5′′-diyl)-9H-carbazole'], self.t.tokenize('9-(5′,5-diphenyl[1,1′:3′,1′′:3′′,1:3,1′′′′-quinquephenyl]-5′′-diyl)-9H-carbazole')) + self.assertEqual(['9,9′-(5′-phenyl[1,1′:3′,1′′-terphenyl]-3,5-diyl)bis-9H-carbazole'], self.t.tokenize('9,9′-(5′-phenyl[1,1′:3′,1′′-terphenyl]-3,5-diyl)bis-9H-carbazole')) + + def test_lambda(self): + self.assertEqual(['lambda5-phosphane'], self.t.tokenize('lambda5-phosphane')) + self.assertEqual(['λ5-phosphane'], self.t.tokenize('λ5-phosphane')) + + def test_chem_names(self): + self.assertEqual(['Tetrahydro', 'furan', '(', 'THF', ')'], self.t.tokenize('Tetrahydro furan (THF)')) + self.assertEqual(['(S)-alanine'], self.t.tokenize('(S)-alanine')) + self.assertEqual(['D-glucose'], self.t.tokenize('D-glucose')) + self.assertEqual(['spiro[4.5]decane'], self.t.tokenize('spiro[4.5]decane')) + self.assertEqual(['β-D-Glucose'], self.t.tokenize('β-D-Glucose')) + self.assertEqual(['L-alanyl-L-glutaminyl-L-arginyl-O-phosphono-L-seryl-L-alanyl-L-proline'], + self.t.tokenize('L-alanyl-L-glutaminyl-L-arginyl-O-phosphono-L-seryl-L-alanyl-L-proline')) + self.assertEqual(['aluminium(3+)'], self.t.tokenize('aluminium(3+)')) + self.assertEqual(['1-methyl-2-methylidene-cyclohexane'], + self.t.tokenize('1-methyl-2-methylidene-cyclohexane')) + + def test_rings(self): + self.assertEqual(["2,2':6',2''-Terphenyl-1,1',1''-triol"], + self.t.tokenize("2,2':6',2''-Terphenyl-1,1',1''-triol")) + self.assertEqual(["phenothiazino[3',4':5,6][1,4]oxazino[2,3-i]benzo[5,6][1,4]thiazino[3,2-c]phenoxazine"], + self.t.tokenize("phenothiazino[3',4':5,6][1,4]oxazino[2,3-i]benzo[5,6][1,4]thiazino[3,2-c]phenoxazine")) + + def test_saccharide(self): + self.assertEqual(['beta-D-Glucopyranosyl-(1->4)-D-glucose'], self.t.tokenize('beta-D-Glucopyranosyl-(1->4)-D-glucose')) + self.assertEqual(['α-D-Glucopyranosyl-(1→4)-β-D-glucopyranose'], self.t.tokenize('α-D-Glucopyranosyl-(1→4)-β-D-glucopyranose')) + self.assertEqual(['α-L-Fucp-(1→3)-[α-D-Galp-(1→4)]-α-D-Glcp-(1→3)-α-D-GalpOAll'], self.t.tokenize('α-L-Fucp-(1→3)-[α-D-Galp-(1→4)]-α-D-Glcp-(1→3)-α-D-GalpOAll')) + self.assertEqual(['(1→4)-β-D-Glucan'], self.t.tokenize('(1→4)-β-D-Glucan')) + self.assertEqual(['((1→2)-α-D-galacto)-(1→4)-β-D-Glucan'], self.t.tokenize('((1→2)-α-D-galacto)-(1→4)-β-D-Glucan')) + + def test_polymer(self): + self.assertEqual([u"poly(2,2'-diamino-5-hexadecylbiphenyl-3,3'-diyl)"], + self.t.tokenize(u"poly(2,2'-diamino-5-hexadecylbiphenyl-3,3'-diyl)")) + + def test_operators(self): + self.assertEqual(['J', '=', '8.8'], self.t.tokenize('J=8.8')) + self.assertEqual(['CH2', '=', 'CH2'], self.t.tokenize('CH2=CH2')) + self.assertEqual(['mL', '×', '3'], self.t.tokenize('mL×3')) + self.assertEqual(['3', '×'], self.t.tokenize('3×')) + self.assertEqual(['×', '3'], self.t.tokenize('×3')) + self.assertEqual(['15', '÷', '3'], self.t.tokenize('15÷3')) + self.assertEqual(['5', '+', '3'], self.t.tokenize('5+3')) + self.assertEqual(['ESI+'], self.t.tokenize('ESI+')) + self.assertEqual(['Ce3+'], self.t.tokenize('Ce3+')) + + def test_stereo(self): + self.assertEqual(['(+)-chiraline'], self.t.tokenize('(+)-chiraline')) + self.assertEqual(['(-)-chiraline'], self.t.tokenize('(-)-chiraline')) + self.assertEqual(['(+-)-chiraline'], self.t.tokenize('(+-)-chiraline')) # \u002d Hyphen-minus + self.assertEqual(['(+−)-chiraline'], self.t.tokenize('(+−)-chiraline')) # \u2212 Minus + self.assertEqual(['(+/-)-chiraline'], self.t.tokenize('(+/-)-chiraline')) # \u002d Hyphen-minus + self.assertEqual(['(+/−)-chiraline'], self.t.tokenize('(+/−)-chiraline')) # \u2212 Minus + self.assertEqual(['(±)-chiraline'], self.t.tokenize('(±)-chiraline')) + + def test_hyphen_twice(self): + self.assertEqual(['cytoplasmic', '-', 'to', '-', 'nuclear'], self.t.tokenize('cytoplasmic-to-nuclear')) + self.assertEqual(['layer', '-', 'by', '-', 'layer'], self.t.tokenize('layer-by-layer')) + self.assertEqual(['end', '-', 'of', '-', 'phase'], self.t.tokenize('end-of-phase')) + self.assertEqual(['oil', '-', 'in', '-', 'water'], self.t.tokenize('oil-in-water')) + self.assertEqual(['nucleation', '-', 'and', '-', 'growth'], self.t.tokenize('nucleation-and-growth')) + self.assertEqual(['State', '-', 'of', '-', 'the', '-', 'art'], self.t.tokenize('State-of-the-art')) + + def test_nmr_types(self): + self.assertEqual(['(13)C', '-', 'NMR'], self.t.tokenize('(13)C-NMR')) + self.assertEqual(['(1)H', '-', 'NMR'], self.t.tokenize('(1)H-NMR')) + self.assertEqual(['(31)P', '-', 'NMR'], self.t.tokenize('(31)P-NMR')) + self.assertEqual(['(19)F', '-', 'NMR'], self.t.tokenize('(19)F-NMR')) + self.assertEqual(['13C', '-', 'NMR'], self.t.tokenize('13C-NMR')) + self.assertEqual(['1H', '-', 'NMR'], self.t.tokenize('1H-NMR')) + self.assertEqual(['31P', '-', 'NMR'], self.t.tokenize('31P-NMR')) + self.assertEqual(['19F', '-', 'NMR'], self.t.tokenize('19F-NMR')) + self.assertEqual(['proton', '-', 'NMR'], self.t.tokenize('proton-NMR')) + + def test_bracket_hyphen(self): + self.assertEqual(['(LBD)-linked'], self.t.tokenize('(LBD)-linked')) + self.assertEqual(['Fe(IV)', '-', 'oxo-mediated'], self.t.tokenize('Fe(IV)-oxo-mediated')) + self.assertEqual(['Fe(IV)', '-', 'mediated'], self.t.tokenize('Fe(IV)-mediated')) + self.assertEqual(['T-bet(-/-)'], self.t.tokenize('T-bet(-/-)')) + self.assertEqual(['(', 'nano', 'LC', '/', 'nano-ESI-IT-MS', ')'], self.t.tokenize('(nano LC/nano-ESI-IT-MS)')) + + def test_hyphen_nosplit(self): + self.assertEqual(['1,4-addition'], self.t.tokenize('1,4-addition')) + self.assertEqual(['1,3\'-substituted'], self.t.tokenize('1,3\'-substituted')) + self.assertEqual(['3,3′-dianisyl-substituted'], self.t.tokenize('3,3′-dianisyl-substituted')) + self.assertEqual(['α-substituted'], self.t.tokenize('α-substituted')) + self.assertEqual(['meta-substituted'], self.t.tokenize('meta-substituted')) + self.assertEqual(['poly-zwitterion'], self.t.tokenize('poly-zwitterion')) + self.assertEqual(['1,2-zwitterion'], self.t.tokenize('1,2-zwitterion')) + + def test_element_hyphen(self): + self.assertEqual(['Fe', '-', 'containing'], self.t.tokenize('Fe-containing')) + self.assertEqual(['C', '-', 'terminal'], self.t.tokenize('C-terminal')) + self.assertEqual(['Li', '-', 'ions'], self.t.tokenize('Li-ions')) + + def test_hyphen_split(self): + self.assertEqual(['hydrocarbon', '-', 'based'], self.t.tokenize('hydrocarbon-based')) + self.assertEqual(['methicillin', '-', 'resistant'], self.t.tokenize('methicillin-resistant')) + self.assertEqual(['methicillin', '-', 'resistant', ','], self.t.tokenize('methicillin-resistant,')) + self.assertEqual(['HPMA', '-', 'based'], self.t.tokenize('HPMA-based')) + self.assertEqual(['HPMA', '-', 'based', ')'], self.t.tokenize('HPMA-based)')) + + def test_bracket_chem2(self): + """Test the word tokenizer on chemical name containing brackets.""" + self.assertEqual( + ['(-)-(5R,8S,8aS)-8-methyl-5-pentyloctahydroindolizine', '(', '8-epi-indolizidine', '209B', ')', '9', 'in', '74', '%', 'yield', '.'], + self.t.tokenize('(-)-(5R,8S,8aS)-8-methyl-5-pentyloctahydroindolizine (8-epi-indolizidine 209B) 9 in 74% yield.') + ) + + def test_bracket_chem_identifier(self): + """Test the word tokenizer on chemical identifier with stereo brackets.""" + self.assertEqual( + ['produced', 'the', 'thiolactam', '(+)-27', 'in', '92', '%', 'yield', '.'], + self.t.tokenize('produced the thiolactam (+)-27 in 92% yield.') + ) + + def test_minus_hyphen(self): + """Test the word tokenizer on a minus used as a hyphen.""" + self.assertEqual( + ['The', 'dose', '−', 'response', 'curve', '.'], + self.t.tokenize('The dose−response curve.') + ) + + def test_abbreviation_sentence_end(self): + """Test the word tokenizer on sentence with abbreviation at the end.""" + self.assertEqual(['Chemical', 'Company', 'Ltd.'], self.t.tokenize('Chemical Company Ltd.')) + self.assertEqual(['Studies', 'in', 'the', 'U.S.'], self.t.tokenize('Studies in the U.S.')) + self.assertEqual(['the', 'mean', '±', 'S.D.'], self.t.tokenize('the mean ± S.D.')) + self.assertEqual(['in', 'a', 'beaker', 'at', 'r.t.'], self.t.tokenize('in a beaker at r.t.')) + self.assertEqual(['Whitaker', 'et', 'al.'], self.t.tokenize('Whitaker et al.')) + + def test_trademarks(self): + self.assertEqual(['CML', '(', 'TM', ')'], self.t.tokenize('CML(TM)')) + self.assertEqual(['Apple', '(', 'R', ')'], self.t.tokenize('Apple(R)')) + self.assertEqual(['IR3535', '(', '®', ')'], self.t.tokenize('IR3535(®)')) + self.assertEqual(['IR3535', '(', '™', ')'], self.t.tokenize('IR3535(™)')) + self.assertEqual(['IR3535', '(', 'TM', ')'], self.t.tokenize('IR3535(TM)')) + self.assertEqual(['IR3535', '(', 'R', ')'], self.t.tokenize('IR3535(R)')) + self.assertEqual(['IR3535', '®'], self.t.tokenize('IR3535®')) + self.assertEqual(['IR3535', '™'], self.t.tokenize('IR3535™')) + + def test_ms(self): + self.assertEqual(['[M+H]+', '1523.86', ',', '[M+2H]2+', '762.43', ',', '[M+3H]3+', '508.62.', 'Observed', ':', '[M+H]+', '1523.20', ',', '[M+2H]2+', '762.45', ',', '[M+3H]3+', '508.70', '.'], self.t.tokenize('[M+H]+ 1523.86, [M+2H]2+ 762.43, [M+3H]3+ 508.62. Observed: [M+H]+ 1523.20, [M+2H]2+ 762.45, [M+3H]3+ 508.70.')) + # This isn't ideal but can't see any alternative apart from super fine tokenization + self.assertEqual(['527.3596', '[', 'M', '+', 'H]+', ',', 'C30H47N4O4+'], self.t.tokenize('527.3596 [M + H]+, C30H47N4O4+')) + + def test_quantities(self): + self.assertEqual(['contamination', 'of', '2', '%', 'Cl2'], self.t.tokenize('contamination of 2% Cl2')) + self.assertEqual(['Placed', 'at', 'a', 'distance', 'of', '7.2', 'cm', '.'], self.t.tokenize('Placed at a distance of 7.2cm.')) + self.assertEqual(['Addition', 'of', '~', '1.8', 'mg', 'of', 'CaCO3', '.'], self.t.tokenize('Addition of ~1.8mg of CaCO3.')) + self.assertEqual(['Recorded', 'in', 'HCl', '(', 'pH', '2', ')', '.'], self.t.tokenize('Recorded in HCl (pH2).')) + self.assertEqual(['Experienced', 'a', 'pressure', 'of', '160', 'kPa', '.'], self.t.tokenize('Experienced a pressure of 160kPa.')) + self.assertEqual(['Brought', 'to', 'pH', '10.5', ',', 'gradually', '.'], self.t.tokenize('Brought to pH10.5, gradually.')) + self.assertEqual(['A', 'volume', 'of', '24', 'cm3', 'was', 'drained', '.'], self.t.tokenize('A volume of 24cm3 was drained.')) + self.assertEqual(['2', 'M', 'H2SO4', 'was', 'heated', '.'], self.t.tokenize('2M H2SO4 was heated.')) + self.assertEqual(['The', 'spectrum', 'was', 'recorded', 'at', '10', '°', 'C'], self.t.tokenize('The spectrum was recorded at 10° C')) + self.assertEqual(['The', 'spectrum', 'was', 'recorded', 'at', '10', '°C'], self.t.tokenize('The spectrum was recorded at 10°C')) + self.assertEqual(['The', 'spectrum', 'was', 'recorded', 'at', '10', '°C'], self.t.tokenize('The spectrum was recorded at 10 °C')) + self.assertEqual(['Added', '3.5', 'g', 'and', 'stirred', 'for', '5.5', 's', '.'], self.t.tokenize('Added 3.5g and stirred for 5.5s.')) + self.assertEqual(['and', '≈', '90', '°'], self.t.tokenize('and ≈90°')) + self.assertEqual(['B3LYP', '/', '6-31g(d)'], self.t.tokenize('B3LYP/6-31g(d)')) + self.assertEqual(['N', '1s', 'spectra'], self.t.tokenize('N 1s spectra')) + self.assertEqual(['In', 'the', '1980s', 'there', 'was'], self.t.tokenize('In the 1980s there was')) + self.assertEqual(['Produced', 'compounds', '3g', ',', '3l', ',', '3m', 'and', '3n'], self.t.tokenize('Produced compounds 3g, 3l, 3m and 3n')) + self.assertEqual(['9.66', '(', 'd', ',', '1H', ',', '3J', '=', '5.4', 'Hz', ',', 'H15', ')'], self.t.tokenize('9.66 (d, 1H, 3J = 5.4Hz, H15)')) + self.assertEqual(['greater', 'than', '3', '×', '10-11', 'mol', 'kg-1', 'or', '2', '×', '10-5', 'mol', 'kg-1'], self.t.tokenize('greater than 3 × 10-11mol kg-1 or 2 × 10-5mol kg-1')) + + def test_linesymbols(self): + self.assertEqual(['N', '(', '■', ')', ',', 'C2', '(', '●', ')', ',', 'C3', '(', '▲', ')'], + self.t.tokenize('N(■), C2(●), C3(▲)')) + self.assertEqual(['benzaldehyde', '(', '○', ')'], self.t.tokenize('benzaldehyde (○)')) + self.assertEqual(['6', '(', '--', ')', ',', '1', '(', '----', ')', 'and', '3', '(', '·····', ')'], + self.t.tokenize('6 (--), 1 (----) and 3 (·····)')) + self.assertEqual(['6', '(', '--', ')', ',', '1', '(', '----', ')', 'and', '3', '(', '·····', ')'], + self.t.tokenize('6 (--), 1 (----) and 3 (·····)')) + + def test_bracket_chems(self): + self.assertEqual(['molecules', 'of', 'the', '[NiL2]', 'complex'], self.t.tokenize('molecules of the [NiL2] complex')) + self.assertEqual(['[Et3NBz][FeIIICl4]'], self.t.tokenize('[Et3NBz][FeIIICl4]')) + self.assertEqual(['[2PA-Mmim][Tf2N]'], self.t.tokenize('[2PA-Mmim][Tf2N]')) + self.assertEqual(['[H2O]', '≈', '3000', 'ppm'], self.t.tokenize('[H2O] ≈ 3000 ppm')) + self.assertEqual(['(', '[Cu+]', '/', '[L]', '=', '3', ')'], self.t.tokenize('([Cu+]/[L] = 3)')) + self.assertEqual(['(Ph3PO)(Ph3POH)(HSO4)'], self.t.tokenize('(Ph3PO)(Ph3POH)(HSO4)')) + self.assertEqual(['(', 'iron(III)'], self.t.tokenize('(iron(III)')) + + def test_chem_formula(self): + self.assertEqual(['(C2H5)4N'], self.t.tokenize('(C2H5)4N')) + self.assertEqual(['(C2H5)4N'], self.t.tokenize('(C2H5)4N')) + self.assertEqual(['monomer', '28M-Py2'], self.t.tokenize('monomer 28M-Py2')) + self.assertEqual(['monomer', '28M-Py2'], self.t.tokenize('monomer 28M-Py2')) + self.assertEqual(['ratio', 'Ag+', '/', 'nucleoside', 'of', '3', ':', '1'], self.t.tokenize('ratio Ag+/nucleoside of 3:1')) + self.assertEqual(['[Al(H2L)n]3-'], self.t.tokenize('[Al(H2L)n]3-')) + self.assertEqual(['[Fe(CN)5(NO)]2-'], self.t.tokenize('[Fe(CN)5(NO)]2-')) + self.assertEqual(['[Fe(CN)5(NO)]2−'], self.t.tokenize('[Fe(CN)5(NO)]2−')) + + def test_deuterated(self): + self.assertEqual(['acetone-d6'], self.t.tokenize('acetone-d6')) + self.assertEqual(['chloroform-d'], self.t.tokenize('chloroform-d')) + self.assertEqual(['d8-THF'], self.t.tokenize('d8-THF')) + self.assertEqual(['THF-d8'], self.t.tokenize('THF-d8')) + self.assertEqual(['d6-DMSO'], self.t.tokenize('d6-DMSO')) + self.assertEqual(['DMSO-d6'], self.t.tokenize('DMSO-d6')) + + def test_reagents_list(self): + """Test the word tokenizer on a reagents list.""" + self.assertEqual( + ['Reagents', ':', '(', 'i', ')', 'H2', '(', '7', 'atm', ')', ',', '10', '%', 'Pd', '/', 'C', ',', 'AcOH', ',', 'rt', ';', '(', 'ii', ')', 'Cl(CH2)3COCl', ',', 'NaOEt', '(', 'cat.', ')', ',', 'CHCl3', ',', 'reflux', ';', '(', 'iii', ')', 'Lawesson', '\'s', 'reagent', ',', '110', '°', 'C', ';'], + self.t.tokenize('Reagents: (i) H2 (7 atm), 10% Pd/C, AcOH, rt; (ii) Cl(CH2)3COCl, NaOEt (cat.), CHCl3, reflux; (iii) Lawesson\'s reagent, 110°C;') + ) + + def test_abbreviation_definition(self): + """Test the word tokenizer on chemical abbreviation definition.""" + self.assertEqual( + ['(', 'ADDP', ':', "1,1'-(azodicarbonyl)dipiperidine", ')'], + self.t.tokenize('(ADDP: 1,1\'-(azodicarbonyl)dipiperidine)') + ) + + def test_nmr_whitespace_error(self): + """Test the word tokenizer on NMR isotope missing preceding whitespace.""" + self.assertEqual(['726.1520', '.', '1H', 'NMR'], self.t.tokenize('726.1520.1H NMR')) + self.assertEqual(['intermediate', '.', '1H', u'NMR'], self.t.tokenize('intermediate.1H NMR')) + + def test_ir_whitespace_error(self): + """Test things like IR(KBr).""" + self.assertEqual(['IR', '(', 'KBr', ')'], self.t.tokenize('IR(KBr)')) + + def test_bracket_whitespace_error(self): + """Test the word tokenizer on bracket whitespace error.""" + self.assertEqual(['7.95', '(', 's', ',', '4H', ')'], self.t.tokenize('7.95(s, 4H)')) + self.assertEqual(['In', 'Fig.', '5', '(', 'a', ',', 'b', ')'], self.t.tokenize('In Fig. 5(a, b)')) + + def test_quote_colon(self): + """Test the word tokenizer quote followed by colon followed by digit (IndexError bugfix).""" + self.assertEqual(['\'', ':', '1'], self.t.tokenize('\':1')) + + def test_chemtext_sentence(self): + """Test tokenization through the Text and Sentence API.""" + t = Text('Hi, my name is Matt. What is your name?', word_tokenizer=self.t) + self.assertEqual( + [['Hi', ',', 'my', 'name', 'is', 'Matt', '.'], ['What', 'is', 'your', 'name', '?']], + [sent.raw_tokens for sent in t.sentences] + ) + + def test_chemtext_sentence2(self): + """Test tokenization through the ChemText and Sentence API.""" + t = Text('(Ka: 1.42×10(10) M-1 vs 1.95±0.35×10(10) M-1) and increased (9.9 vs 3.7±0.4 fmol)', word_tokenizer=self.t) + self.assertEqual( + [['(', 'Ka', ':', '1.42', '×', '10(10)', 'M-1', 'vs', '1.95', '±', '0.35', '×', '10(10)', 'M-1', ')', 'and', 'increased', '(', '9.9', 'vs', '3.7', '±', '0.4', 'fmol', ')']], + [sent.raw_tokens for sent in t.sentences] + ) + + +class TestFineWordTokenizer(unittest.TestCase): + """Test the fine word tokenizer.""" + + maxDiff = None + + def setUp(self): + self.t = FineWordTokenizer() + + def test_final_full_stop(self): + """Test the word tokenizer splits off final full stop only.""" + self.assertEqual( + ['This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.'], + self.t.tokenize('This is Mr. Hoppy\'s tortoise.') + ) + + def test_full_stop_following(self): + """Test the word tokenizer splits off final full stop if followed by brackets or quotes.""" + self.assertEqual( + ['(', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.', ')'], + self.t.tokenize('(This is Mr. Hoppy\'s tortoise.)') + ) + self.assertEqual( + ['"', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tortoise', '.', '"'], + self.t.tokenize('"This is Mr. Hoppy\'s tortoise."') + ) + self.assertEqual( + ['"', 'This', 'is', 'Mr.', 'Hoppy', '\'s', 'tort.oise', '.', '"'], + self.t.tokenize('"This is Mr. Hoppy\'s tort.oise."') + ) + + def test_dollar(self): + """Test the word tokenizer on dollar symbol.""" + self.assertEqual( + ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.'], + self.t.tokenize('On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.') + ) + + def test_quote(self): + """Test the word tokenizer on quotes.""" + self.assertEqual( + ['"', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', '"', 'Slocum', 'said', '.'], + self.t.tokenize('"We beat some pretty good teams to get here," Slocum said.') + ) + + def test_brackets_quotes(self): + """Test the word tokenizer on brackets and quotes.""" + self.assertEqual( + ['Well', ',', 'we', 'could', 'n\'t', 'have', 'this', 'predictable', ',', 'cliche', '-', 'ridden', ',', '"', 'Touched', 'by', 'an', 'Angel', '"', '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wan', 'na', '-', 'be', 'if', 'she', 'did', 'n\'t', '.'], + self.t.tokenize('Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.') + ) + + def test_exclamation(self): + """Test the word tokenizer on exclamation mark.""" + self.assertEqual( + ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!'], + self.t.tokenize('I cannot cannot work under these conditions!') + ) + + def test_digit_comma(self): + """Test the word tokenizer on commas within numbers.""" + self.assertEqual( + ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.'], + self.t.tokenize('The company spent $30,000,000 last year.') + ) + + def test_percentage(self): + """Test the word tokenizer on percent sign.""" + self.assertEqual( + ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.'], + self.t.tokenize('The company spent 40.75% of its income last year.') + ) + + def test_colon_time(self): + """Test the word tokenizer on colon between digits in a time.""" + self.assertEqual( + ['He', 'arrived', 'at', '3', ':', '00', 'pm', '.'], + self.t.tokenize('He arrived at 3:00 pm.') + ) + + def test_word_colon(self): + """Test the word tokenizer on colon after word.""" + self.assertEqual( + ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.'], + self.t.tokenize('I bought these items: books, pencils, and pens.') + ) + + def test_digit_comma_space(self): + """Test the word tokenizer on comma between digits with a space.""" + self.assertEqual( + ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.'], + self.t.tokenize('Though there were 150, 100 of them were old.') + ) + + def test_digit_comma_multiple(self): + """Test the word tokenizer on comma at end of digits.""" + self.assertEqual( + ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.'], + self.t.tokenize('There were 300,000, but that wasn\'t enough.') + ) + + def test_theyll(self): + """Test the word tokenizer on the word they'll.""" + self.assertEqual( + ['They', "'ll", 'save', 'and', 'invest', 'more', '.'], + self.t.tokenize('They\'ll save and invest more.') + ) + + def test_bracket1(self): + """Test the word tokenizer on sentence containing brackets.""" + self.assertEqual( + ['For', 'a', 'few', 'weeks', '(', '>', '24', 'days', ')', '.'], + self.t.tokenize('For a few weeks (>24 days).') + ) + + def test_bracket_chems(self): + self.assertEqual(['molecules', 'of', 'the', '[', 'NiL2', ']', 'complex'], self.t.tokenize('molecules of the [NiL2] complex')) + self.assertEqual(['[', 'Et3NBz', ']', '[', 'FeIIICl4', ']'], self.t.tokenize('[Et3NBz][FeIIICl4]')) + self.assertEqual(['[', '2PA', '-', 'Mmim', ']', '[', 'Tf2N', ']'], self.t.tokenize('[2PA-Mmim][Tf2N]')) + self.assertEqual(['[', 'H2O', ']', '≈', '3000', 'ppm'], self.t.tokenize('[H2O] ≈ 3000 ppm')) + self.assertEqual(['(', '[', 'Cu', '+', ']', '/', '[', 'L', ']', '=', '3', ')'], self.t.tokenize('([Cu+]/[L] = 3)')) + self.assertEqual(['(', 'Ph3PO', ')', '(', 'Ph3POH', ')', '(', 'HSO4', ')'], self.t.tokenize('(Ph3PO)(Ph3POH)(HSO4)')) + self.assertEqual(['(', 'iron', '(', 'III', ')'], self.t.tokenize('(iron(III)')) + + def test_chem_formula(self): + self.assertEqual(['(', 'C2H5', ')', '4N'], self.t.tokenize('(C2H5)4N')) + self.assertEqual(['(', 'C2H5', ')', '4N'], self.t.tokenize('(C2H5)4N')) + self.assertEqual(['monomer', '28M', '-', 'Py2'], self.t.tokenize('monomer 28M-Py2')) + self.assertEqual(['monomer', '28M', '-', 'Py2'], self.t.tokenize('monomer 28M-Py2')) + self.assertEqual(['ratio', 'Ag', '+', '/', 'nucleoside', 'of', '3', ':', '1'], self.t.tokenize('ratio Ag+/nucleoside of 3:1')) + self.assertEqual(['[', 'Al', '(', 'H2L', ')', 'n', ']', '3', '-'], self.t.tokenize('[Al(H2L)n]3-')) + self.assertEqual(['[', 'Fe', '(', 'CN', ')', '5', '(', 'NO', ')', ']', '2', '-'], self.t.tokenize('[Fe(CN)5(NO)]2-')) + self.assertEqual(['[', 'Fe', '(', 'CN', ')', '5', '(', 'NO', ')', ']', '2', '−'], self.t.tokenize('[Fe(CN)5(NO)]2−')) + + def test_chem_names(self): + self.assertEqual(['Tetrahydro', 'furan', '(', 'THF', ')'], self.t.tokenize('Tetrahydro furan (THF)')) + self.assertEqual(['(', 'S', ')', '-', 'alanine'], self.t.tokenize('(S)-alanine')) + self.assertEqual(['D', '-', 'glucose'], self.t.tokenize('D-glucose')) + self.assertEqual(['spiro', '[', '4.5', ']', 'decane'], self.t.tokenize('spiro[4.5]decane')) + self.assertEqual(['β', '-', 'D', '-', 'Glucose'], self.t.tokenize('β-D-Glucose')) + self.assertEqual(['L', '-', 'alanyl', '-', 'L', '-', 'glutaminyl', '-', 'L', '-', 'arginyl', '-', 'O', '-', 'phosphono', '-', 'L', '-', 'seryl', '-', 'L', '-', 'alanyl', '-', 'L', '-', 'proline'], + self.t.tokenize('L-alanyl-L-glutaminyl-L-arginyl-O-phosphono-L-seryl-L-alanyl-L-proline')) + self.assertEqual(['aluminium', '(', '3', '+', ')'], self.t.tokenize('aluminium(3+)')) + self.assertEqual(['1', '-', 'methyl', '-', '2', '-', 'methylidene', '-', 'cyclohexane'], + self.t.tokenize('1-methyl-2-methylidene-cyclohexane')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_nlp_unitstokenizer.py b/tests/test_nlp_unitstokenizer.py new file mode 100644 index 0000000..d4a9b10 --- /dev/null +++ b/tests/test_nlp_unitstokenizer.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +import unittest + +import cathodedataextractor.nlp + + +class TestUnitsTokenizer(unittest.TestCase): + + def test_tokenizer4units(self): + tokenize = cathodedataextractor.nlp.units_tokenizer + + test = """ + ( ... 0.1 ... C ... ) ... within ... P2 ... - ... type ... Na0.67Mn0.7(Ni0.15Cu0.15)O2 ... micr ... + Na3Mn1-xCrxTi(PO4)3 ... and ... the ... mixture ... was ... heated ... up ... to ... 800 ... °C ... for + ... 24 ... h ... dried ... at ... 100 ... °C ... and ... then ... calcined ... at ... 400 ... °C ... for + ... 2 ... h ... under ... nitrogen ... atmosphere. ... dried ... in ... a ... 100 ... °C ... vacuum ... + oven ... for ... 12 ... h. ... for ... 4 ... h ... at ... 450 ... °C ... in ... an ... Ar ... atmosphere + ... to ... decompose ... the ... organic ... compound ... , ... and ... then ... for ... 7 ... h ... at + ... 700 ... , ... 750 ... and ... 800 ... °C ... to ... obtain ... the ... car ... 0.01 ... C ... 3.5 ... + - ... 4.5 ... V ... CE ... The ... intrest ... calculation ... results ... show ... that ... borophene ... + exhibits ... a ... superhigh ... specific ... capacity ... ( ... 1 ... , ... 218 ... mAhg-1 ... ) ... 5 + ... Cto10 ... C ... respectively ... , ... 11 ... cyclesat ... rates ... of ... 0.1 ... , ... 0.2 ... , + ... 0.5 ... , ... 1 ... , ... 2 ... , ... 5 ... to ... 10 ... C ... , ... to ... be ... 149 ... , ... 139 + ... , ... 129 ... , ... 118 ... , ... 109 ... , ... 101 ... , ... 93 ... , ... 78 ... , ... and ... ( ... + 64 ... mAhg-1 ... ) ... at ... 0.05 ... , ... 0.1 ... , ... 0.2 ... , ... 0.3 ... , ... 0.4 ... , ... 0.5 + ... , ... 1 ... , ... 2 ... , ... and ... 3 ... Ag-1 ... and ... 2 ... when ... it ... returned ... to ... + 0.4 ... Ag-1 ... , ... the ... capacity ... of ... NCMO-1 ... also ... returned ... to ... 103 ... mAhg-1 + ... , ... indicating ... the ... excellent ... reversibility ... of ... the ... material. ... It ... is + ... worth ... noting ... that ... even ... at ... a ... current ... density ... of ... 3 ... Ag-1 ... , + ... NCMO-1 ... still ... exhibited ... a ... reversible ... specific ... capacity ... of ... 64 ... mAhg-1 + ... ; ... profiles ... ( ... 1 ... st ... , ... 25 ... , ... 50 ... , ... 75 ... , ... 100 ... , ... 125 + ... , ... 150 ... , ... 200 ... , ... 250 ... , ... 300 ... , ... 350 ... , ... 400 ... , ... 450 ... , + ... and1 ... , ... 2 ... nd ... , ... 5 ... th ... , ... 10 ... th ... , ... 15 ... th ... , ... 30 ... th + ... , ... 50 ... th ... , ... 70 ... th ... , ... and ... 100 ... cycles ... ( ... charge ... states ... ) + ... ar + """ + + test_res = tokenize.tokenize( + '(0.1C) within P2-type Na0.67Mn0.7(Ni0.15Cu0.15)O2 micr Na3Mn1-xCrxTi(PO4)3 and the mixture was heated ' + 'up to 800 °C for 24h dried at 100°C and then calcined at 400°C for 2h under nitrogen ' + 'atmosphere. dried in a 100 °C vacuum oven for 12h. for 4h at 450°C in an Ar ' + 'atmosphere to decompose the organic compound, and then for 7h at 700, 750 and 800°C to ' + 'obtain the car 0.01C 3.5-4.5V CE The intrest calculation results show ' + 'that borophene exhibits a superhigh specific capacity ' + '(1,218mAhg-1) 5Cto10C respectively, 11cyclesat rates ' + 'of 0.1, 0.2, 0.5, 1, 2, 5to10C, to be 149 , 139 , 129 , ' + '118,109 , 101 , 93 , 78 , and (64 mAhg-1) at 0.05 , ' + '0.1 , 0.2 , 0.3 , 0.4 , 0.5 , 1 , 2 , and 3Ag-1and2 when ' + 'it returned to 0.4Ag-1, the capacity of NCMO-1 also ' + 'returned to 103 mAhg-1 , indicating the excellent ' + 'reversibility of the material. It is worth noting that ' + 'even at a current density of 3Ag-1, NCMO-1 still exhibited ' + 'a reversible specific capacity of 64mAhg-1;' + ' profiles (1st, 25, 50, 75, 100, 125, 150, 200, 250, 300, 350, 400, 450, and' + '1,2nd,5th,10th,15th,30th,50th,70th, and 100 cycles (charge states) ar') + self.assertEqual(list(map(lambda x: x.strip(), test.split(" ... "))), test_res) diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..352a941 --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +import unittest +from cathodedataextractor.parse import end_parentheses, relation_extraction + + +class TestParse(unittest.TestCase): + + def test_end_parentheses(self): + cems = [ + ('Na3Mn1-xCrxTi(PO4)3 (x=0.01, 0.03, 0.05, 0.07, 0.10, 0.12, 0.15)', + (20, '(x=0.01, 0.03, 0.05, 0.07, 0.10, 0.12, 0.15)')), + ('Nax(Cu-Fe-Mn)O2', (None, None)), + ('NaFexCr1-X(SO4)2', (None, None)), + ('Na0.6Li0.6(Mn0.72Ni0.18Co0.10)O2', (None, None)), + ('P2-Na0.59Mn0.85Co0.1(Ti2V)0.05O2', (None, None)), + ('Na3Fe2PO4(P2O7', (None, None)), + ('Na0.71Co1-xZnxO2(0 ≤ x ≤ 0.02)', (16, "(0 ≤ x ≤ 0.02)")), + ( + 'Na0.71Co1-xZnxO2(0 ≤ x ≤ 0.02, ... sdf()sff (...) sfsf)', + (16, "(0 ≤ x ≤ 0.02, ... sdf()sff (...) sfsf)")), + ('Na0.67Ni0.28Mn0.67Y0.05O2 (NMY-5', (26, "(NMY-5")), + ('Na0.67Ni0.33Mn0.67O2 (NNM)', (21, "(NNM)")), + ('O3-NaBxMn1-xO2 (x=0, 0.05, 0.1, 0.15, 0.25) oxides', (15, "(x=0, 0.05, 0.1, 0.15, 0.25) oxides")), + ('NaFexCr1-X(SO4)2 ((dsfjs)dsfd', (17, "((dsfjs)dsfd")), + ('Na0.66Li0.18Mn0.71Co0.08Ni0.21O2()', (32, "()")), + ] + for cem in cems: + self.assertEqual(end_parentheses(cem[0]), cem[1]) + + def test_PropertyParse(self): + pp = relation_extraction.PropertyParse() + self.assertEqual(pp.parse_stoichiometric_variables('Na0.8(Li0.33Mn0.67-xTix)O2', + 'XPS spectra of Mn 2p and Ti 2p for Na0.8(Li0.33Mn0.67-xTix)O2 ' + 'electrodes are used to validate this speculation. The ratio of ' + 'Mn3+ to Mn4+ in the x=0 and x=0.1 electrodes was 0.48:0.52 and ' + '0.23:0.77 calculated by fitting Mn 2p spectra, respectively [47,48]. '), + ('Na0.8(Li0.33Mn0.67-xTix)O2 (x=0 and x=0.1 )', 'x')) diff --git a/tests/test_text2chem.py b/tests/test_text2chem.py new file mode 100644 index 0000000..2883558 --- /dev/null +++ b/tests/test_text2chem.py @@ -0,0 +1,81 @@ +# coding=utf-8 +import json +import os +import unittest +from tests.resources import TEST_PATH +from text2chem.regex_parser import RegExParser +from text2chem.parser_pipeline import ParserPipelineBuilder +from text2chem.preprocessing_tools.additives_processing import AdditivesProcessing +from text2chem.preprocessing_tools.chemical_name_processing import ChemicalNameProcessing +from text2chem.preprocessing_tools.mixture_processing import MixtureProcessing +from text2chem.preprocessing_tools.phase_processing import PhaseProcessing +from text2chem.postprocessing_tools.substitute_additives import SubstituteAdditives + + +mp = ParserPipelineBuilder() \ + .add_preprocessing(AdditivesProcessing) \ + .add_preprocessing(ChemicalNameProcessing) \ + .add_preprocessing(PhaseProcessing) \ + .add_preprocessing(MixtureProcessing)\ + .add_postprocessing(SubstituteAdditives)\ + .set_regex_parser(RegExParser)\ + .build() + + +class TestText2chem(unittest.TestCase): + + @staticmethod + def return_data(testdata): + for idx, data in enumerate(testdata): + chem_name = data["material"] + output = data["parser_output"][0] + result = mp.parse(chem_name).to_dict() + yield output, result + + def test_formulas(self): + """ + test formulas + """ + testdata = json.loads(open(os.path.join(TEST_PATH, "formulas.json"), encoding="utf8").read()) + for output, result in self.return_data(testdata): + self.assertEqual(output, result) + + def test_additives(self): + """ + test additives + """ + testdata = json.loads(open(os.path.join(TEST_PATH, "additives.json"), encoding="utf8").read()) + for output, result in self.return_data(testdata): + self.assertEqual(output, result) + + def test_chemical_names(self): + """ + test chemical names + """ + testdata = json.loads(open(os.path.join(TEST_PATH, "chemical_names.json"), encoding="utf8").read()) + for output, result in self.return_data(testdata): + self.assertEqual(output, result) + + def test_mixtures(self): + """ + test mixtures: alloys, solid solutions, composites + """ + testdata = json.loads(open(os.path.join(TEST_PATH, "mixtures.json"), encoding="utf8").read()) + for output, result in self.return_data(testdata): + self.assertEqual(output, result) + + def test_phases(self): + """ + test phases + """ + testdata = json.loads(open(os.path.join(TEST_PATH, "phases.json"), encoding="utf8").read()) + for output, result in self.return_data(testdata): + self.assertEqual(output, result) + + def test_all(self): + """ + comprehensive test + """ + testdata = json.loads(open(os.path.join(TEST_PATH, "comprehensive.json"), encoding="utf-8").read()) + for output, result in self.return_data(testdata): + self.assertEqual(output, result) diff --git a/tests/test_text2chem_parser.py b/tests/test_text2chem_parser.py new file mode 100644 index 0000000..ac83ac2 --- /dev/null +++ b/tests/test_text2chem_parser.py @@ -0,0 +1,56 @@ +# coding=utf-8 +import unittest +from collections import OrderedDict + +from text2chem.core.formula_parser import __parse_parentheses as parse_parentheses +from text2chem.regex_parser import RegExParser + +parser = RegExParser() + + +class TestParser(unittest.TestCase): + + def test_regex_parser_make_fraction_convertion_case1(self): + self.assertEqual(parser.make_fraction_convertion( + 'Na2/3Ni(3/10-x)MgxMn7/10O2'), 'Na2/3Ni3/10-xMgxMn7/10O2' + ) + + def test_regex_parser_make_fraction_convertion_case2(self): + self.assertEqual(parser.make_fraction_convertion( + '(Cu1/3Nb2/3)1/4Ti(3/4-x)ZrxO2'), '(Cu1/3Nb2/3)1/4Ti3/4-xZrxO2' + ) + + def test_regex_parser_make_fraction_convertion_case3(self): + self.assertEqual(parser.make_fraction_convertion( + '(Cu1/3Nb2/3)(1/4-y)Ti(3/4-x)Zr(x+y)O2'), '(Cu1/3Nb2/3)1/4-yTi3/4-xZrx+yO2' + ) + + def test_regex_parser_make_fraction_convertion_case4(self): + self.assertEqual(parser.make_fraction_convertion( + '(Cu1/3Nb2/3)(1/4-y-x)Ti(3/4-x)Zr(2x+y)O2'), '(Cu1/3Nb2/3)(1/4-y-x)Ti3/4-xZr2*x+yO2' + ) + + def test_core_formula_parser__parse_parentheses_case1(self): + formula_dict = OrderedDict() + formula_dict, _ = parse_parentheses('Ti(OCH(CH3)2)4', "1", formula_dict) + self.assertEqual(formula_dict, {'C': '(8)+(4)', 'H': '(24)+(4)', 'O': '4', 'Ti': '1'}) + + def test_core_formula_parser__parse_parentheses_case2(self): + formula_dict = OrderedDict() + formula_dict, _ = parse_parentheses('Na2/3Ni3/10-xMgxMn7/10O2', "1", formula_dict) + self.assertEqual(formula_dict, {'Na': '0.667', 'Ni': '0.3-x', 'Mg': 'x', 'Mn': '0.7', 'O': '2'}) + + def test_core_formula_parser__parse_parentheses_case3(self): + formula_dict = OrderedDict() + formula_dict, _ = parse_parentheses('(Cu1/3Nb2/3)1/4Ti3/4-xZrxO2', "1", formula_dict) + self.assertEqual(formula_dict, {'Cu': '0.083', 'Nb': '0.167', 'O': '2', 'Ti': '0.75-x', 'Zr': 'x'}) + + def test_core_formula_parser__parse_parentheses_case4(self): + formula_dict = OrderedDict() + formula_dict, _ = parse_parentheses('(Cu1/3Nb2/3)1/4-yTi3/4-xZr(x+y)O2', "1", formula_dict) + self.assertEqual(formula_dict, {'Cu': '0.0833-0.333*y', 'Nb': '0.167-0.667*y', 'Ti': '0.75-x', 'Zr': 'x+y', 'O': '2'}) + + def test_core_formula_parser__parse_parentheses_case5(self): + formula_dict = OrderedDict() + formula_dict, _ = parse_parentheses('(Cu1/3Nb2/3)(1/4-y-x)Ti3/4-xZr2*x+yO2', "1", formula_dict) + self.assertEqual(formula_dict, {'Cu': '0.0833-0.333*x-0.333*y', 'Nb': '0.167-0.667*x-0.667*y', 'Ti': '0.75-x', 'Zr': '2*x+y', 'O': '2'}) diff --git a/tests/test_text_process.py b/tests/test_text_process.py new file mode 100644 index 0000000..5e1d5e6 --- /dev/null +++ b/tests/test_text_process.py @@ -0,0 +1,270 @@ +# -*- coding: utf-8 -*- +import unittest +from cathodedataextractor.text import BatteriesTextProcessor + + +class TestBatteriesTextProcessor(unittest.TestCase): + + def test_func1(self): + texts = [ + ( # 0 + 'We synthesized P2-Na0.67+xNi0.33Mn0.67O2 (x = 2 %, 2.5 %, 3 %, 3.5 %, 4 %) ' + 'with different sodium contents by solid state method, and compared the ' + 'differences in material properties under different sodium contents to ' + 'determine the optimal sodium content.', + + 'We synthesized P2-Na0.67+xNi0.33Mn0.67O2 (x = 2 %, 2.5 %, 3 %, 3.5 %, 4 %) ' + 'with different sodium contents by solid state method, and compared the ' + 'differences in material properties under different sodium contents to ' + 'determine the optimal sodium content.' + ), + + ( # 1 + 'O3-type layered Na(Ni0.4Cu0.1Mn0.4Ti0.1)1-xLaxO2 (x = 0 , 0.001 , 0.003 , ' + '0.005 , termed as NMCT, NMCT-Lax , respectively), which is consistent with ' + 'the data of No. 1535056 (tetragonal Na3V2(PO4)2O2F, abbreviated as NVPOF) in ' + 'Inorganic Crystal Structure Database.', + + 'O3-type layered Na(Ni0.4Cu0.1Mn0.4Ti0.1)1-xLaxO2 (x = 0 , 0.001 , 0.003 , ' + '0.005 , termed as NMCT, NMCT-Lax , respectively), which is consistent with ' + 'the data of No. 1535056 (tetragonal Na3V2P2O10F, abbreviated as NVPOF) in ' + 'Inorganic Crystal Structure Database.' + ), + + ( # 2 + 'In this work, the obtained layered O3-type NaFe9/20Cr9/20Ti1/10O2 delivered ' + 'a high initial discharge capacity of 140.63 mAh g−1 compared to ' + 'Na0.66Fe1/3Cr1/3Ti1/3O2 cathode material, which delivered discharge capacity ' + 'of 135.5 mAh g−1[7]', + + 'In this work, the obtained layered O3-NaTi0.1Cr0.45Fe0.45O2 delivered a high ' + 'initial discharge capacity of 140.63 mAhg-1 compared to ' + 'Na0.66Ti0.33Cr0.33Fe0.33O2 cathode material, which delivered discharge ' + 'capacity of 135.5 mAhg-1[7]'), + + ( # 3 + 'The compositions of h-NM, h-NMC and h-NMC2 are identified as Na0.66MnO2, ' + 'Na0.65Mn0.9Cu0.1O2 and Na0.63Mn0.8Cu0.2O2, based on ICP-AES results (Table ' + 'S1).', + + 'The compositions of h-NM, h-NMC and h-NMC2 are identified as Na0.66MnO2, ' + 'Na0.65Mn0.9Cu0.1O2 and Na0.63Mn0.8Cu0.2O2, based on ICP-AES results (Table ' + 'S1).' + ), + + ( # 4 + 'An initial reversible capacity of 131.3 mAhg-1 at 0.1 C can been seen for ' + 'the P2-NMNCC electrode, which maintained a retention of 86.7 % after 200 ' + 'cycles with an average Coulombic efficiency of approximately 99 % . Whereas ' + 'the P2-NMNC electrode only had 71.9 % initial capacity retention after 200 ' + 'cycles at 0.1 C . and then drying in a vacuum at 80 °C . First, selenium ' + 'powder was dispersed into 5 mg mL-1 graphite oxide (GO) solution by ' + 'sonication for 10 hours, followed by a freeze-drying process of 48 hours.', + + 'An initial reversible capacity of 131.3 mAhg-1 at 0.1 C can been seen for ' + 'the P2-NMNCC electrode, which maintained a retention of 86.7 % after 200 ' + 'cycles with an average Coulombic efficiency of approximately 99 % . Whereas ' + 'the P2-NMNC electrode only had 71.9 % initial capacity retention after 200 ' + 'cycles at 0.1 C . and then drying in a vacuum at 80 °C . First, selenium ' + 'powder was dispersed into 5 mg mL-1 graphite oxide (GO) solution by ' + 'sonication for 10 hours, followed by a freeze-drying process of 48 hours.'), + + ( # 5 + 'is still over 99.5% after 10, 000 cycles which presents a high degree of the ' + 'electrochemical. Density functional theory calculations and ab initio ' + 'molecular dynamics simulations are performed to study the feasibility of ' + 'using borophene, a newly synthesized two-dimensional sheet of boron, as an ' + 'anode material for sodium-ion and sodium-O2 batteries. The theoretical ' + 'capacity of borophene is found to be as high as 1,218mAhg-1 (Na0.5B). ' + '(Li0.75B, 1,860mAhg-1),', + + 'is still over 99.5% after 10, 000 cycles which presents a high degree of the ' + 'electrochemical. Density functional theory calculations and ab initio ' + 'molecular dynamics simulations are performed to study the feasibility of ' + 'using borophene, a newly synthesized two-dimensional sheet of boron, as an ' + 'anode material for sodium-ion and sodium-O2 batteries. The theoretical ' + 'capacity of borophene is found to be as high as 1218mAhg-1 (Na0.5B). ' + '(Li0.75B, 1860mAhg-1),' + ), + + ( # 6 + 'The O3-type Na((Mn0.4Fe0.3Ni0.3)1-xTix)O2 (x = 0 and 0.1) powders were ' + 'synthesized using a simple solid-state method. First, stoichiometric amounts ' + 'of Na2CO3 (Sigma Aldrich), Mn2O3 (Sigma Aldrich), Fe2O3 (Sigma Aldrich), NiO ' + '(Sigma Aldrich), and TiO2 (Sigma Aldrich) powders were mixed using an agate ' + 'mortar.', + + 'The O3-Na((Mn0.4Fe0.3Ni0.3)1-xTix)O2 (x = 0 and 0.1) powders were ' + 'synthesized using a simple solid-state method. First, stoichiometric amounts ' + 'of Na2CO3 (Sigma Aldrich), Mn2O3 (Sigma Aldrich), Fe2O3 (Sigma Aldrich), NiO ' + '(Sigma Aldrich), and TiO2 (Sigma Aldrich) powders were mixed using an agate ' + 'mortar.' + ), + + ( # 7 + 'Na0.70Ni0.20Cu0.15Mn0.65O2 (NNCM) powder was synthesized through the sol-gel ' + 'route. Stoichiometric amounts of nickelacetatetetrahydrate, ' + 'Copper(II)nitratetrihydrate, manganese(II)acetatetetrahydrate, and ' + 'sodiumcarbonate were dissolved in distilled water and were then stirred for ' + '5 h. Appropriate amounts of citric acid & ethyleneglycol were added to the ' + 'solution. The mixed solution was further stirred for another 10 h followed ' + 'by heating at 100 °C to make a gel. The gel was dried at 150 °C and ground ' + 'using a pestle and mortar. The resulting powder was calcinated at 550 °C for ' + '12 h and ground again. Subsequently, the ground powder was calcined at ' + 'different temperatures (700 °C, 800 °C, 850 °C, and 950 °C) for 12 h in air ' + 'and cooled to room temperature in the same furnace.', + + 'Na0.7Mn0.65Ni0.2Cu0.15O2 (NNCM) powder was synthesized through the sol-gel ' + 'route. Stoichiometric amounts of nickelacetatetetrahydrate, ' + 'Coppernitratetrihydrate, manganeseacetatetetrahydrate, and sodiumcarbonate ' + 'were dissolved in distilled water and were then stirred for 5 h. Appropriate ' + 'amounts of citric acid & ethyleneglycol were added to the solution. The ' + 'mixed solution was further stirred for another 10 h followed by heating at ' + '100 °C to make a gel. The gel was dried at 150 °C and ground using a pestle ' + 'and mortar. The resulting powder was calcinated at 550 °C for 12 h and ' + 'ground again. Subsequently, the ground powder was calcined at different ' + 'temperatures (700 °C, 800 °C, 850 °C, and 950 °C) for 12 h in air and cooled ' + 'to room temperature in the same furnace.' + ), + + ( # 8 + 'Therefore, NaNi0.45Mn0.3Ti0.2Zr0.05O2 exhibits an initial reversible ' + 'capacity of 141.4\u202fmAh g−1 with a coulombic efficiency of 98.8% ' + 'and remarkable capacity retention of 70% after 200 cycles at 0.05C, ' + 'presenting better electrochemistry performance than the conventional ' + 'NaNi0.5Mn0.5O2.', + + 'Therefore, NaZr0.05Ti0.2Mn0.3Ni0.45O2 exhibits an initial reversible ' + 'capacity of 141.4mAhg-1 with a coulombic efficiency of 98.8% and remarkable ' + 'capacity retention of 70% after 200 cycles at 0.05C, presenting better ' + 'electrochemistry performance than the conventional NaMn0.5Ni0.5O2.' + ), + + ( # 9 + 'Conversely, Na||P2-NaMN shows severe capacity loss at −40\u2009°C and ' + '920\u2009mA\u2009g−1 (Fig. 4c) and a discharge capacity retention of 77.8% ' + 'after 215 cycles at RT (Supplementary Fig. 19). The rest of the composites, ' + 'Na3Mn1-xCrxTi(PO4)3 (x=0.01, 0.03, 0.05, 0.07, 0.10, 0.12, 0.15), are ' + 'denoted as 1%Cr-NMTP, 3%Cr-NMTP, 5%Cr-NMTP, 7%Cr-NMTP, 10%Cr-NMTP, ' + '12%Cr-NMTP, and 15%Cr-NMTP, respectively. At present, the most studied ' + 'candidate materials are Na3V2P3O12 and Na3V2P2O8F3 (NVPF) . The ' + 'schematic procedure for the synthesis of NaCo0.15Ni0.815Al0.035O2 via ' + 'hydrothermal method (named as NCA-Hydro) is shown in Fig. S1(a). ' + 'depicts XRD patterns for Na2MgNiTeO6 (NMNTO) and Na2MgZnTeO6 (NMZTO) ' + 'materials prepared via a solid-state reaction method and XRD pattern of ' + 'Na2Mg2TeO6 (NMTO) is also added for comparison. Pristine P2-Na0.67MnO2 (NMO) ' + 'and Mo-doped P2-Na0.67Mn1-xMoxO2 (x=0.01, 0.03 and 0.05, defined as NMMO-x ' + '(x=1, 3, 5)) were synthesized via a solid-state reaction, and pure P2 phase ' + 'is obtained for x≤0.05, as shown in . Pristine P2-Na0.67MnO2 (NMO) and ' + 'Mo-doped P2-Na0.67Mn1-xMoxO2 (x = 0.01, 0.03 and 0.05, defined as NMMO-x (x ' + '= 1, 3, 5)) were synthesized via a solid-state reaction, and pure P2 phase ' + 'is obtained for x ≤ 0.05, as shown in Fig. S1. All series of layer/tunnel ' + 'composite materials (Na0.60Mn1-x-yFexTiyO2, x = 0, 0.05, 0.1, y = 0, 0.05, ' + '0.1, namely Na0.6MnO2, Na0.6Mn0.95Fe0.05O2, Na0.6Mn0.9Fe0.1O2, ' + 'Na0.6Ti0.05Mn0.95O2, Na0.6Ti0.1Mn0.9O2, Na0.6Ti0.05Mn0.9Fe0.05O2 and ' + 'Na0.6Ti0.1Mn0.8Fe0.1O2, marked as MFT0, MF5, MF10, MT5, MT10, MFT5 and ' + 'MFT10) samples were synthesized by co-precipitation method and ' + 'high-temperature solid-state reaction. ', + + 'Conversely, Na||P2-NaMN shows severe capacity loss at -40°C and 920mAg-1 ' + '(Fig. 4c) and a discharge capacity retention of 77.8% after 215 cycles at RT ' + '(Supplementary Fig. 19). The rest of the composites, Na3Mn1-xCrxTi(PO4)3 ' + '(x=0.01, 0.03, 0.05, 0.07, 0.10, 0.12, 0.15), are denoted as 1%Cr-NMTP, ' + '3%Cr-NMTP, 5%Cr-NMTP, 7%Cr-NMTP, 10%Cr-NMTP, 12%Cr-NMTP, and 15%Cr-NMTP, ' + 'respectively. At present, the most studied candidate materials are ' + 'Na3V2P3O12 and Na3V2P2O8F3 (NVPF) . The schematic procedure for the ' + 'synthesis of NaCo0.15Ni0.81Al0.04O2 via hydrothermal method (named as ' + 'NCA-Hydro) is shown in Fig. S1(a). depicts XRD patterns for ' + 'Na0.67Mg0.33Ni0.33Te0.33O2 (NMNTO) and Na0.67Mg0.33Zn0.33Te0.33O2 (NMZTO) ' + 'materials prepared via a solid-state reaction method and XRD pattern of ' + 'Na0.67Mg0.67Te0.33O2 (NMTO) is also added for comparison. Pristine ' + 'P2-Na0.67MnO2 (NMO) and Mo-doped P2-Na0.67Mn1-xMoxO2 (x=0.01, 0.03 and 0.05, ' + 'defined as NMMO-x (x=1, 3, 5)) were synthesized via a solid-state reaction, ' + 'and pure P2 phase is obtained for x≤0.05, as shown in . Pristine ' + 'P2-Na0.67MnO2 (NMO) and Mo-doped P2-Na0.67Mn1-xMoxO2 (x = 0.01, 0.03 and ' + '0.05, defined as NMMO-x (x = 1, 3, 5)) were synthesized via a solid-state ' + 'reaction, and pure P2 phase is obtained for x ≤ 0.05, as shown in Fig. S1. ' + 'All series of layer/tunnel composite materials (Na0.60Mn1-x-yFexTiyO2, x = ' + '0, 0.05, 0.1, y = 0, 0.05, 0.1, namely Na0.6MnO2, Na0.6Mn0.95Fe0.05O2, ' + 'Na0.6Mn0.9Fe0.1O2, Na0.6Ti0.05Mn0.95O2, Na0.6Ti0.1Mn0.9O2, ' + 'Na0.6Ti0.05Mn0.9Fe0.05O2 and Na0.6Ti0.1Mn0.8Fe0.1O2, marked as MFT0, MF5, ' + 'MF10, MT5, MT10, MFT5 and MFT10) samples were synthesized by ' + 'co-precipitation method and high-temperature solid-state reaction.' + ), + + ( # 10 + 'The rate performance of NaFe0.5 Mg0.5 O2 different current density is 158, ' + '150, 141,133,121 mAh g−1 at 50, 100, 200, 400, 800\xa0mA /g respectively. ' + '(Na2/3Ni1/3Mn2/3O2, P2-NNMO) The Na3Mn1-xCrxTi(PO4)3 (x=0, 0.01, 0.03, 0.05, ' + '0.07, 0.10, 0.12, 0.15) cathode materials series were synthesized through a ' + 'feasible sol-gel method. A slight decrease in the capacity was confirmed in ' + 'the Ti-substituted Na[(Mn0.4Fe0.3Ni0.3)1−xTix]O2 (for x = 0, 167 mAh g−1; x ' + '= 0.1, 151 mAh g−1 at 24 mA g−1) Nominal Na0.6(Li0.2Mn0.8)O2 with the ' + 'layered P3 structure (s.g. R3m) showed XPS evidence of holes in the O-2p ' + 'bands on removal of Na+ ions. A large voltage plateau at 4.1 V versus Na+/Na ' + 'faded significantly over 50 cycles although the capacity in the range 20 ≤ V ' + '< 4.5 V remained unchanged. Oxidation of the O-2p bands is not reversible. ' + 'At a rate of 0.5C, NMCO_750 exhibited a high gravimetric capacity of 84\xa0' + 'mA\xa0h\xa0g−1, which was higher than NMCO_650 (80\xa0mA\xa0h\xa0g−1) and ' + 'NMCO_850 (77\xa0mA\xa0h\xa0g−1).', + + 'The rate performance of NaMg0.5Fe0.5O2 different current density is 158, ' + '150, 141,133,121 mAhg-1 at 50, 100, 200, 400, 800mAg-1 respectively. ' + '(Na0.67Mn0.67Ni0.33O2, P2-NNMO) The Na3Mn1-xCrxTi(PO4)3 (x=0, 0.01, 0.03, ' + '0.05, 0.07, 0.10, 0.12, 0.15) cathode materials series were synthesized ' + 'through a feasible sol-gel method. A slight decrease in the capacity was ' + 'confirmed in the Ti-substituted Na((Mn0.4Fe0.3Ni0.3)1-xTix)O2 (for x = 0, ' + '167 mAhg-1; x = 0.1, 151 mAhg-1 at 24 mAg-1) Nominal Na0.6Li0.2Mn0.8O2 with ' + 'the layered P3 structure (s.g. R3m) showed XPS evidence of holes in the O-2p ' + 'bands on removal of Na+ ions. A large voltage plateau at 4.1V versus Na+/Na ' + 'faded significantly over 50 cycles although the capacity in the range 20 ≤ V ' + '< 4.5V remained unchanged. Oxidation of the O-2p bands is not reversible. At ' + 'a rate of 0.5C, NMCO_750 exhibited a high gravimetric capacity of 84mAhg-1, ' + 'which was higher than NMCO_650 (80mAhg-1) and NMCO_850 (77mAhg-1).' + ), + + ( # 11 + 'A simple sol-gel method was used to synthesize the P2-type NMNCC and NMNC ' + 'cathode materials. Manganese(II)acetate tetrahydrate, nickel(II)acetate ' + 'tetrahydrate, cobalt(II)acetate tetrahydrate, copper(II)acetate monohydrate ' + 'and sodiumcarbonate anhydrous were dissolved into a citric acid solution ' + 'with a corresponding stoichiometric ratio', + + 'A simple sol-gel method was used to synthesize the P2-type NMNCC and NMNC ' + 'cathode materials. Manganeseacetate tetrahydrate, nickelacetate ' + 'tetrahydrate, cobaltacetate tetrahydrate, copperacetate monohydrate and ' + 'sodiumcarbonate anhydrous were dissolved into a citric acid solution with a ' + 'corresponding stoichiometric ratio' + ), + + ( # 12 + 'P2-Na0.67Ni0.33Mn0.67O2-yFy (y = 0 , 0.05 , 0.1 , 0.15 , abbreviated as ' + 'NaNMO, NaNMOF0.05, NaNMOF0.1, NaNMOF0.15, respectively) and ' + 'P2-Na0.67Ni0.33Mn0.67-xTixO1.9F0.1 (x = 0.1 , 0.2 , 0.3 , 0.4 , abbreviated ' + 'as NaNMTi0.1OF, NaNMTi0.2OF, NaNMTi0.3OF, and NaNMTi0.4OF, respectively) ' + 'were synthesized by solid-state reaction.', + + 'P2-Na0.67Ni0.33Mn0.67O2-yFy (y = 0 , 0.05 , 0.1 , 0.15 , abbreviated as ' + 'NaNMO, NaNMOF0.05, NaNMOF0.1, NaNMOF0.15, respectively) and ' + 'P2-Na0.67Ni0.33Mn0.67-xTixO1.9F0.1 (x = 0.1 , 0.2 , 0.3 , 0.4 , abbreviated ' + 'as NaNMTi0.1OF, NaNMTi0.2OF, NaNMTi0.3OF, and NaNMTi0.4OF, respectively) ' + 'were synthesized by solid-state reaction.' + ), + + ( # 13 + 'Here we synthesized an O3-NaNi0.5-xMn0.3Ti0.2ZrxO2 (NaNMTZ , x=0.02,0.05 , ' + 'NaNMTZ2, NaNMTZ5) by co-substituting NaNi0.5Mn0.5O2 (NaNM) with Ti and Zr. ' + 'In this work, the influence of co-substitution of Ti and Zr on ' + 'NaNi0.5Mn0.5O2 was studied.', + + 'Here we synthesized an O3-NaNi0.5-xMn0.3Ti0.2ZrxO2 (NaNMTZ,x=0.02,0.05 , ' + 'NaNMTZ2, NaNMTZ5) by co-substituting NaMn0.5Ni0.5O2 (NaNM) with Ti and Zr. ' + 'In this work, the influence of co-substitution of Ti and Zr on ' + 'NaMn0.5Ni0.5O2 was studied.' + ) + + ] + + for text in texts: + bat = BatteriesTextProcessor(text[0], special_normal=True) + self.assertEqual(' '.join(bat.processed_text), text[1])