Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
GGNoWayBack committed Feb 20, 2024
2 parents 1d47324 + ff1972a commit e828cb3
Show file tree
Hide file tree
Showing 19 changed files with 5,233 additions and 60 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/compre-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# CI workflow: lint with flake8 and run the pytest suite on pushes and pull
# requests that target the main branch.
name: tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # NOTE(review): Python 3.7 is end-of-life and may not be available on
        # every ubuntu-latest runner image - confirm, or pin the runner image.
        python-version: [ '3.7' ]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          # Cache pip downloads between runs.
          cache: "pip"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          # DeprecationWarnings are silenced so they do not clutter the test output.
          pytest -W ignore::DeprecationWarning
23 changes: 11 additions & 12 deletions cathodedataextractor/nlp/cner.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ def prompt_tag(self, cem: str, normalize=False) -> str:
return 'element'
elif cem in ELEMENTS_NAMES_UL:
return 'element_name'
elif cem in POLYATOMIC_IONS:
return 'polyatomic_ions'
elif cem in SIMPLE_COMPOUND:
return 'simple'
elif self.is_word(cem):
Expand All @@ -164,29 +166,28 @@ def __generalized_elements(cem: str):
if normalize:
cem = self.normalized_compound_formula(cem)
shape = word_shape(cem)
likely_abb = True if 'XXX' in shape else False
likely_abb = any_func(shape, ABB_SHAPE)
if not likely_abb and (shape.startswith('d') or 'b.b' in shape):
return 'other'
elif 'Xx-Xx-Xx' in shape: # Nax(Cu-Fe-Mn)O2
return 'irregular_shape'
parse_ = self.is_compound_formula(cem)
if parse_ and parse_[1]:
if not likely_abb and parse_ and parse_[1]:
# NaCoMnO NaMnNiCuFeTiOF
if len([i for i in parse_[1] if i in TRANSITION_ME]) >= 2 and parse_[0] == ''.join(
parse_[1]):
return 'is_likely_abbreviation'
# 'Na/Li/Ni/Mn/Co'
elif set(cem.split('/')).difference(ELEMENTS):
elif not set(cem.split('/')).difference(ELEMENTS):
return 'synthetic'
# Nax(Cu-Fe-Mn)O2
elif 'Xx-Xx-Xx' in shape or all(len(i) <= 2 for i in cem.split('-')):
elif all(len(i) <= 2 for i in cem.split('-')):
return 'irregular_shape'
elif '/' not in cem:
return 'synthetic'
else:
return 'other'
elif un_chem_abbreviation.search(cem):
return 'other'
elif '-' in cem and 'Xx-Xx-Xx' in shape:
return 'irregular_shape'
elif any_func(cem, SOLVENT_NAMES):
return 'solvent_names'
elif any_func(cem, RAW_MATERIAL):
Expand All @@ -196,8 +197,6 @@ def __generalized_elements(cem: str):
elif cem in OTHER or any_func(cem, OTHER_IN) or pattern.match(cem):
return 'other'

likely_abb_ = any_func(shape, ABB_SHAPE)

if not likely_abb:
try:
_elements = [element.value for element in Composition(cem).elements]
Expand All @@ -214,7 +213,7 @@ def __generalized_elements(cem: str):
return 'raw_material'
except (ValueError, AttributeError, KeyError):
ele_result = {}
if self.prompt_element in cem and not likely_abb_:
if self.prompt_element in cem and not likely_abb:
ele_result = __generalized_elements(cem)
if len(ele_result) > 2 and 'O' in ele_result:
if 'M' in ele_result:
Expand All @@ -227,11 +226,11 @@ def __generalized_elements(cem: str):
if 'Xx*Xx' in shape:
return 'other'
elif self.prompt_element in cem or self.prompt_element[0] in cem:
if not ele_result or likely_abb_:
if not ele_result or likely_abb:
return 'is_likely_abbreviation'
else:
return 'raw_material'
return 'is_likely_abbreviation' if likely_abb_ or shape in ['XX', 'Xd.d'] and not any_func(cem, ['=',
return 'is_likely_abbreviation' if likely_abb or shape in ['XX', 'Xd.d'] and not any_func(cem, ['=',
',']) else 'other'

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion cathodedataextractor/parse/regex_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def end_parentheses(cem: str):

@lru_cache(None)
def polyatomic_ions(string):
    """Return True when ``string`` is exactly one of the known polyatomic ions.

    The check is an exact match against the entries of the module-level
    ``POLYATOMIC_IONS`` collection; substrings do not match.

    Results are memoized by ``lru_cache`` with no size bound, so ``string``
    must be hashable.

    :param string: candidate ion formula, e.g. ``'PO4'``.
    :return: ``True`` if ``string`` equals an entry of ``POLYATOMIC_IONS``,
        otherwise ``False``.
    """
    # A single membership test replaces the former element-by-element
    # comparison (and the unreachable duplicate ``return`` that followed it).
    return string in POLYATOMIC_IONS


@lru_cache(None)
Expand Down
8 changes: 4 additions & 4 deletions cathodedataextractor/parse/regex_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
"other_me": ['B', 'Al', 'Ga', 'In', 'Tl', 'Si', 'Ge', 'Sn', 'Pb', 'As', 'Sb', 'Bi', 'Te'],
}

POLYATOMIC_IONS = ['CO3', 'PO4', 'PO3', 'P2O7', 'NH4', 'NO3', 'NO2', 'SO4', 'SO3', 'OH', 'CN', 'SiO4']
POLYATOMIC_IONS = {'CO3', 'PO4', 'PO3', 'P2O7', 'NH4', 'NO3', 'NO2', 'SO4', 'SO3', 'OH', 'CN', 'SiO4'}
SIMPLE_COMPOUND = ['NaHO', 'NaCl', 'NaF', 'NaBr', 'Na2S2', 'Na2CO3']
# m_anions = ['H2PO4', 'HPO4', 'HCO3', 'HSO4', 'HSO3', 'C2O4']
# d_anions = ['CO3', 'PO4', 'PO3', 'NH4', 'NO3', 'NO2', 'SO4', 'SO3', 'OH', 'CN']
Expand All @@ -84,15 +84,15 @@
GREEK_CHARS = {chr(i) for i in range(945, 970)}

OTHER = {'JCPDS', 'JCPSD', 'RT', 'SOC', 'ICP', 'SIB', 'DFT', 'STA', 'ICSD', 'HITACH', 'NIST', 'PAL', 'TXM', 'SXRPD',
'PVDF', 'DFPT', 'CNTs', 'USP', 'ALD', 'PH3', 'CV', 'CS', 'PC', 'OC', 'CB', 'ND', 'TG', 'NPs'}
'PVDF', 'DFPT', 'CNTs', 'USP', 'ALD', 'PH3', 'CV', 'CS', 'PC', 'OC', 'CB', 'ND', 'TG', 'NPs',
'Na-rich', 'Na-ion'}

OTHER_IN = ['PDF', 'No.', '↔', 'Nae', 'Fig',
'Na-', # Na-rich Na-ion
'AB', # AB ABCABC...
'Non', # Non-sub
]

ABB_SHAPE = ['XXdd', 'XXX', 'XXd', 'XxXX']
ABB_SHAPE = ['XXdd', 'XXX', 'XXd', 'XxXX', 'Xddd']

NUMBER_REGEX = r'\d++[\d. ]*+'

Expand Down
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# coding=utf-8
3 changes: 3 additions & 0 deletions tests/resources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import os

# Directory that contains this ``__init__.py``, as derived from ``__file__``.
TEST_PATH = os.path.dirname(__file__)
Loading

0 comments on commit e828cb3

Please sign in to comment.