Skip to content

Commit

Permalink
Test examples using doctest (#10)
Browse files Browse the repository at this point in the history
  • Loading branch information
vbkaisetsu authored Apr 13, 2023
1 parent 044ce52 commit 86c1251
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 52 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,13 @@ jobs:
- name: Test package
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt
pip install -r requirements-dev.txt zstandard
python -c "import zstandard;zstandard.ZstdDecompressor().copy_stream(open('tests/data/system.dic.zst','rb'),open('tests/data/system.dic','wb'))"
pip install vibrato --no-index --find-links target/wheels --force-reinstall
mypy --strict tests
pytest
python -m doctest README.md
python -m doctest docs/source/examples.rst
pack-sdist:
needs: [ test ]
Expand Down
65 changes: 27 additions & 38 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,43 +40,40 @@ To perform tokenization, follow [the document of Vibrato](https://github.com/daa
Check the version number as shown below to use compatible models:

```python
import vibrato
vibrato.VIBRATO_VERSION
#=> "0.5.0"
>>> import vibrato
>>> vibrato.VIBRATO_VERSION
'0.5.0'

```

Examples:

```python
import vibrato
>>> import vibrato

>>> with open('tests/data/system.dic', 'rb') as fp:
... tokenizer = vibrato.Vibrato(fp.read())

with open('path/to/system.dic', 'rb') as fp:
dict_data = fp.read()
tokenizer = vibrato.Vibrato(dict_data)
>>> tokens = tokenizer.tokenize('社長は火星猫だ')

tokens = tokenizer.tokenize('社長は火星猫だ')
>>> len(tokens)
5

len(tokens)
#=> 5
>>> tokens[0]
Token { surface: "社長", feature: "名詞,普通名詞,一般,*" }

list(tokens)
#=> [Token { surface: "社長", feature: "名詞,一般,*,*,*,*,社長,シャチョウ,シャチョー,," },
# Token { surface: "は", feature: "助詞,係助詞,*,*,*,*,は,ハ,ワ,," },
# Token { surface: "火星", feature: "名詞,一般,*,*,*,*,火星,カセイ,カセイ,," },
# Token { surface: "猫", feature: "名詞,一般,*,*,*,*,猫,ネコ,ネコ,," },
# Token { surface: "だ", feature: "助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ,," }]
>>> tokens[0].surface()
'社長'

tokens[0].surface()
#=> '社長'
>>> tokens[0].feature()
'名詞,普通名詞,一般,*'

tokens[0].feature()
#=> '名詞,一般,*,*,*,*,社長,シャチョウ,シャチョー,,'
>>> tokens[0].start()
0

tokens[0].start()
#=> 0
>>> tokens[0].end()
2

tokens[0].end()
#=> 2
```

## Note for distributed models
Expand All @@ -85,22 +82,14 @@ The distributed models are compressed in zstd format. If you want to load these
you must decompress them outside the API.

```python
import vibrato
import zstandard # zstandard package in PyPI

dctx = zstandard.ZstdDecompressor()
with open('path/to/system.dic.zst', 'rb') as fp:
dict_reader = dctx.stream_reader(fp)
tokenizer = vibrato.Vibrato(dict_reader.read())
```

## Documentation
>>> import vibrato
>>> import zstandard # zstandard package in PyPI

Use the help function to show the API reference.
>>> dctx = zstandard.ZstdDecompressor()
>>> with open('tests/data/system.dic.zst', 'rb') as fp:
... with dctx.stream_reader(fp) as dict_reader:
... tokenizer = vibrato.Vibrato(dict_reader.read())

```python
import vibrato
help(vibrato)
```

## License
Expand Down
21 changes: 8 additions & 13 deletions docs/source/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,22 @@ Examples:
>>> import vibrato
>>> with open('path/to/system.dic', 'rb') as fp:
... dict_data = fp.read()
>>> tokenizer = vibrato.Vibrato(dict_data)
>>> with open('tests/data/system.dic', 'rb') as fp:
... tokenizer = vibrato.Vibrato(fp.read())
>>> tokens = tokenizer.tokenize('社長は火星猫だ')
>>> len(tokens)
5
>>> list(tokens)
[Token { surface: "社長", feature: "名詞,一般,*,*,*,*,社長,シャチョウ,シャチョー,," },
Token { surface: "は", feature: "助詞,係助詞,*,*,*,*,は,ハ,ワ,," },
Token { surface: "火星", feature: "名詞,一般,*,*,*,*,火星,カセイ,カセイ,," },
Token { surface: "猫", feature: "名詞,一般,*,*,*,*,猫,ネコ,ネコ,," },
Token { surface: "だ", feature: "助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ,," }]
>>> tokens[0]
Token { surface: "社長", feature: "名詞,普通名詞,一般,*" }
>>> tokens[0].surface()
'社長'
>>> tokens[0].feature()
'名詞,一般,*,*,*,*,社長,シャチョウ,シャチョー,,'
'名詞,普通名詞,一般,*'
>>> tokens[0].start()
0
Expand All @@ -56,6 +51,6 @@ you must decompress them outside the API:
>>> import zstandard # zstandard package in PyPI
>>> dctx = zstandard.ZstdDecompressor()
>>> with open('path/to/system.dic.zst', 'rb') as fp:
... dict_reader = dctx.stream_reader(fp)
>>> tokenizer = vibrato.Vibrato(dict_reader.read())
>>> with open('tests/data/system.dic.zst', 'rb') as fp:
... with dctx.stream_reader(fp) as dict_reader:
... tokenizer = vibrato.Vibrato(dict_reader.read())
Binary file added tests/data/system.dic.zst
Binary file not shown.

0 comments on commit 86c1251

Please sign in to comment.