Skip to content

Commit

Permalink
Add a grammar modules with ready-to-use grammars
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Lapp authored and rlouf committed Jan 26, 2024
1 parent 5d67a5a commit c2b0f4f
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 0 deletions.
17 changes: 17 additions & 0 deletions docs/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,23 @@ print(result)
# 4*5*3*2*1/6*4*3*2*1/2*1*1*1/4*1*1*1/2*1*1*1/2*1*1/2*1*1*5*1/2*2*1*1/2*1*1*6*1*1/2*1*1*1*1*2*1*1*1*1
```


EBNF grammars can be cumbersome to write. This is why Outlines provides ready-to-use grammar definitions in the `outlines.grammars` module:

```python
from outlines import models, generate, grammars

model = models.transformers("mistralai/Mistral-7B-v0.1")
# `grammars.arithmetic` is one of the grammar definitions shipped in `outlines.grammars`.
generator = generate.cfg(model, grammars.arithmetic, max_tokens=100)

result = generator("Write a series of operations on integers that return the number 5 ")
print(result)
# 100-2-75+50-18+27-501.
```

The available grammars are listed [here](https://github.com/outlines-dev/outlines/tree/main/outlines/grammars).


### Regex-guided generation

Slightly simpler, but no less useful, Outlines can generate text that is in the language of a [regular expression](https://www.regular-expressions.info/tutorial.html). For instance, to force the model to generate IP addresses:
Expand Down
2 changes: 2 additions & 0 deletions outlines/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Outlines is a Generative Model Programming Framework."""
import outlines.generate
import outlines.grammars
import outlines.models
import outlines.text.generate
from outlines.base import vectorize
Expand All @@ -14,4 +15,5 @@
"Function",
"prompt",
"vectorize",
"grammars",
]
2 changes: 2 additions & 0 deletions outlines/fsm/fsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from lark import Lark

# from outlines.fsm.parsing import PartialLark
from outlines import grammars
from outlines.caching import cache
from outlines.fsm.regex import create_fsm_index_tokenizer, make_deterministic_fsm

Expand Down Expand Up @@ -200,6 +201,7 @@ def __init__(self, cfg_string: str, tokenizer: "Tokenizer"):
propagate_positions=False,
maybe_placeholders=False,
regex=True,
import_paths=[grammars.GRAMMAR_PATH],
)
self.terminal_regexps = dict()
for terminal in self.parser.terminals:
Expand Down
14 changes: 14 additions & 0 deletions outlines/grammars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from pathlib import Path

# Directory that holds the grammar files shipped with the package.
GRAMMAR_PATH = Path(__file__).parent / "grammars"


def read_grammar(grammar_file_name, base_grammar_path=GRAMMAR_PATH):
    """Return the text of a grammar file.

    Parameters
    ----------
    grammar_file_name
        Name of the grammar file (e.g. ``"arithmetic.lark"``), relative to
        ``base_grammar_path``.
    base_grammar_path
        Directory that contains the grammar file, given as a ``str`` or a
        path-like object. Defaults to ``GRAMMAR_PATH``.

    Returns
    -------
    str
        The full contents of the grammar file.

    Raises
    ------
    OSError
        If the file cannot be opened (for instance when it does not exist).

    """
    # Wrap in `Path` so that a plain string directory works with the `/` operator.
    full_path = Path(base_grammar_path) / grammar_file_name
    # Decode explicitly as UTF-8 instead of relying on the platform's locale default.
    return full_path.read_text(encoding="utf-8")


# Ready-to-use grammars, read from GRAMMAR_PATH as strings when this module is imported.
# NOTE: inside this module the name `json` refers to the grammar text, not the stdlib module.
arithmetic = read_grammar("arithmetic.lark")
json = read_grammar("json.lark")
18 changes: 18 additions & 0 deletions outlines/grammars/arithmetic.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Arithmetic expressions over numbers: "+", "-", "*", "/", unary minus and parentheses.
?start: sum

// Loosest-binding level: left-recursive chain of additions and subtractions of products.
?sum: product
| sum "+" product -> add
| sum "-" product -> sub

// Binds tighter than `sum`: left-recursive chain of multiplications and divisions of atoms.
?product: atom
| product "*" atom -> mul
| product "/" atom -> div

// Operands: a number literal, a negated atom, or a parenthesised sum.
?atom: NUMBER -> number
| "-" atom -> neg
| "(" sum ")"

// Terminals are taken from the `common` grammar.
%import common.NUMBER
%import common.WS_INLINE

// Inline whitespace between tokens is skipped.
%ignore WS_INLINE
80 changes: 80 additions & 0 deletions outlines/grammars/common.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Adapted from https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark

// Lark License:
// Copyright © 2017 Erez Shinan
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
// the Software, and to permit persons to whom the Software is furnished to do so,
// subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


// Basic terminals for common use


//
// Numbers
//

// A single decimal digit, and a single hexadecimal digit (letters in either case).
DIGIT: "0".."9"
HEXDIGIT: "a".."f"|"A".."F"|DIGIT

// INT is unsigned; SIGNED_INT accepts an optional leading "+" or "-".
// DECIMAL always contains a dot: digits on both sides, or on one side only.
INT: DIGIT+
SIGNED_INT: ["+"|"-"] INT
DECIMAL: INT "." INT? | "." INT

// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
// FLOAT needs an exponent or a decimal point; a bare run of digits only matches INT.
_EXP: ("e"|"E") SIGNED_INT
FLOAT: INT _EXP | DECIMAL _EXP?
SIGNED_FLOAT: ["+"|"-"] FLOAT

// Any unsigned number (float or integer), and its optionally signed variant.
NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER

//
// TODO: Working escaped_string
//
// Double-quoted string whose content contains no double quote. The pattern has
// no escape handling, so the first inner `"` always ends the match.
UNESCAPED_STRING: /\"[^"]*\"/



//
// Names (Variables)
//
// Single ASCII letters, by case.
LCASE_LETTER: "a".."z"
UCASE_LETTER: "A".."Z"

// One letter of either case, and a run of one or more letters.
LETTER: UCASE_LETTER | LCASE_LETTER
WORD: LETTER+

// Identifier: starts with "_" or a letter, followed by any number of "_", letters or digits.
CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*


//
// Whitespace
//
// WS_INLINE: one or more spaces or tabs (no line breaks).
// WS: one or more of space, tab, form feed, carriage return or newline.
WS_INLINE: (" "|/\t/)+
WS: /[ \t\f\r\n]/+

// Line terminators; NEWLINE is one or more LF, each optionally preceded by a CR.
CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+


// Comments
// SH_COMMENT, CPP_COMMENT and SQL_COMMENT start at their marker ("#", "//", "--")
// and extend up to, but not including, the next newline.
// C_COMMENT is a non-greedy "/*" ... "*/" span that may cross line breaks.
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /(.|\n)*?/ "*/"
SQL_COMMENT: /--[^\n]*/
19 changes: 19 additions & 0 deletions outlines/grammars/json.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// JSON-like values: objects, arrays, strings, numbers and the literals true/false/null.
?start: value

// Any single value. Strings use UNESCAPED_STRING from the `common` grammar.
?value: object
| array
| UNESCAPED_STRING
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null

// Arrays and objects hold an optional comma-separated list of values / pairs.
array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
// A pair is a string key, a colon, then a value.
pair : UNESCAPED_STRING ":" value

// Terminals are taken from the `common` grammar.
%import common.UNESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

// Whitespace between tokens is skipped.
%ignore WS
31 changes: 31 additions & 0 deletions tests/test_grammars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pytest

import outlines.grammars as grammars
from outlines.fsm.fsm import CFGFSM


@pytest.mark.parametrize("grammar", [grammars.json, grammars.arithmetic])
def test_grammar_module(grammar):
    """Check that each grammar exposed by ``outlines.grammars`` builds a ``CFGFSM``.

    The parametrized ``grammar`` string is passed to ``CFGFSM`` itself, so a
    grammar that fails to load (for instance because of a broken
    ``%import common.*``) makes the test fail.
    """

    class MockTokenizer:
        # Minimal stand-in exposing the attributes and methods this test
        # provides to `CFGFSM`; no real model or tokenizer is loaded.
        vocabulary = {"(": 1, ")": 2, "a": 3, "eos": 4}
        special_tokens = {"eos"}
        eos_token = "eos"
        eos_token_id = 4

        def convert_token_to_string(self, token):
            return token

        @property
        def inverse_vocabulary(self):
            return {v: k for k, v in self.vocabulary.items()}

        def decode(self, token_ids):
            return [self.inverse_vocabulary[t] for t in token_ids]

    tokenizer = MockTokenizer()
    # Use the grammar under test rather than an unrelated inline grammar;
    # otherwise both parametrized cases would check the same hard-coded string.
    fsm = CFGFSM(grammar, tokenizer)
    assert isinstance(fsm, CFGFSM)

0 comments on commit c2b0f4f

Please sign in to comment.