Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improvements for R-pronouns and relativisation (#10) #11

Merged
merged 1 commit into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions mwe_query/canonicalform.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,14 @@ def mknewnode(stree, mwetop, atts, annotations):
newnode.attrib['maxnodecount'] = f'{len(stree)}'
return newnode


def expandnonheadwordnode(nonheadwordnode, phrasenodeproperties):
    """Wrap a copy of *nonheadwordnode* in a newly created phrase node.

    The new phrase node is created with the attributes given in
    *phrasenodeproperties* and then receives, as its ``rel``, the value that
    ``gav`` returns for the ``rel`` of *nonheadwordnode*. A copy of
    *nonheadwordnode* with ``rel`` set to ``hd`` becomes its child.

    :param nonheadwordnode: the node to be wrapped in a phrase
    :param phrasenodeproperties: attributes for the new phrase node; a ``rel``
        entry in it is overwritten by the ``rel`` of *nonheadwordnode*
    :return: the new phrase node containing the copied node
    """
    originalrel = gav(nonheadwordnode, 'rel')

    # the rel change is applied to a copy, not to the node that was passed in
    headchild = copy.copy(nonheadwordnode)
    headchild.attrib['rel'] = 'hd'

    wrapper = ET.Element('node', attrib=phrasenodeproperties)
    wrapper.attrib['rel'] = originalrel
    wrapper.append(headchild)
    return wrapper
def zullenheadclause(stree: SynTree) -> bool:
if stree.tag == 'node':
cat = gav(stree, 'cat')
Expand Down Expand Up @@ -1016,9 +1023,10 @@ def newgenvariants(stree: SynTree) -> List[SynTree]:
Rpronounobj1node = copy.copy(obj1node)
Rpronounobj1node.attrib['lemma'] = 'er|hier|daar|waar|ergens|nergens|overal'
Rpronounobj1node.attrib['pt'] = 'vnw'
newphrase = expandnonheadwordnode(Rpronounobj1node, {})
for child in newppnode2:
newppnode2.remove(child)
newppnode2.append(Rpronounobj1node)
newppnode2.append(newphrase)
newppnode2.append(newvz2)

# pp with R-pronoun object which has been replaced by a full NP with a dummymod
Expand Down Expand Up @@ -1047,11 +1055,15 @@ def newgenvariants(stree: SynTree) -> List[SynTree]:
pppronadvvcnode.append(pronadvnode1)
pppronadvvcnode.append(newvcnode)

# pp's with a pronominal adverb. e.g. daarnaar
pprel = gav(ppnode, 'rel')
pronadvnode = getpronadv(vzlemma, pprel)
pronadvppnode = expandnonheadwordnode(pronadvnode, {'cat': 'pp', 'rel': pprel})
pronadvnode.attrib['rel'] = 'hd'
pronadvppnode.append(pronadvnode)

alternativesnode = mkalternativesnode([[ppnode], [newppnode2], [newppnode3], [
pppobj1vcnode], [pppronadvvcnode], [pronadvnode]])
pppobj1vcnode], [pppronadvvcnode], [pronadvppnode]])
parent.append(alternativesnode)

vblgennpnodeids = newstree.xpath(
Expand Down Expand Up @@ -1404,8 +1416,9 @@ def relpronsubst(stree: SynTree) -> SynTree:
def expandfull(stree: SynTree) -> SynTree:
    """Apply the complete sequence of expansion steps to *stree*.

    The steps run in this order, each on the result of the previous one:
    ``relpronsubst``, ``expandnonheadwords``, ``indextransform``.

    :param stree: the syntactic structure to expand
    :return: the result of the final step (``indextransform``)
    """
    # possibly add getlcat
    stree1 = relpronsubst(stree)
    # expandnonheadwords must run before indextransform; an early return here
    # would leave this step and the next one unreachable
    stree2 = expandnonheadwords(stree1)
    stree3 = indextransform(stree2)
    return stree3


def gettopnode(stree):
Expand Down
7 changes: 6 additions & 1 deletion mwe_query/lcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import copy
import lxml.etree as ET

# getlcat compares pt against this value and sets its result to None on a match
dummy = 'dummy'


def expandnonheadwords(stree: SynTree) -> SynTree:
# it is presupposed that the input stree is not None
Expand Down Expand Up @@ -47,7 +49,8 @@ def getlcatatt(node: SynTree) -> str:

def mkphrase(child: SynTree) -> SynTree:
newnode = ET.Element('node')
newnode.attrib['id'] = child.attrib['id'] + 'a'
if 'íd' in child.attrib:
newnode.attrib['id'] = child.attrib['id'] + 'a'
lcat = getlcatatt(child)
if lcat in validcats:
newnode.attrib['cat'] = lcat
Expand Down Expand Up @@ -176,6 +179,8 @@ def getlcat(node: SynTree, prel=None) -> str: # noqa: C901
result = 'np'
elif pt == 'spec':
result = None
elif pt == dummy:
result = None
else:
print('Unknown att value (pt) encountered in:')
ET.dump(node)
Expand Down
52 changes: 52 additions & 0 deletions mwe_query/trymwes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from sastadev.alpinoparsing import parse
from lcat import expandnonheadwords
from sastadev.treebankfunctions import indextransform
from lxml import etree
from canonicalform import generatequeries, expandfull

# when True, trysomemwes also dumps the expanded parses and the matching nodes
debug = False

# test item: the MWE string handed to generatequeries by trysomemwes, paired
# with the example utterances that are parsed and queried
geenhaankraaien = (
    '0geen *haan zal naar iets kraaien',
    [
        'Daar kraait geen haan naar',
        'Hier heeft geen haan naar gekraaid',
        'geen haan kraaide daarnaar',
        'geen haan kraaide ernaar dat hij niet kwam',
        'geen haan kraaide er naar dat hij niet kwam',
        'er is geen haan die daar naar kraait',
    ],
)

def select(mweutts, utt=None):
    """Return *mweutts* unchanged, or narrowed to a single utterance.

    :param mweutts: a pair whose second member is an indexable collection of
        utterances
    :param utt: index of the utterance to keep; ``None`` keeps everything
    :return: *mweutts* itself when *utt* is ``None``, otherwise the pair
        ``(mweutts[0], [mweutts[1][utt]])``
    """
    if utt is not None:
        return (mweutts[0], [mweutts[1][utt]])
    return mweutts

def getparses(utterances):
    """Return the result of ``parse`` for every utterance, in input order.

    :param utterances: an iterable of utterances to parse
    :return: a list with one parse per utterance
    """
    return [parse(utterance) for utterance in utterances]

def trysomemwes():
    """Print, per example utterance, the number of hits of each MWE query.

    The queries are generated from the MWE in ``geenhaankraaien``; every
    utterance is parsed, expanded with ``expandfull`` and then searched with
    each of the three queries via ``xpath``. With ``debug`` set, the expanded
    parse and every matching node are dumped as well.
    """
    mwe, utterances = select(geenhaankraaien)
    queries = generatequeries(mwe)
    # pair each of the first three generated queries with the label printed for it
    labeledmwequeries = tuple(
        zip(('MWEQ', 'NMQ', 'MLQ'), (queries[0], queries[1], queries[2]))
    )

    # all utterances are parsed before any result is printed
    parses = getparses(utterances)
    for utterance, uttparse in zip(utterances, parses):
        print(f'{utterance}:')
        expanded = expandfull(uttparse)
        if debug:
            etree.dump(expanded)
        for label, mwequery in labeledmwequeries:
            hits = expanded.xpath(mwequery)
            if debug:
                print('Found hits:')
                for hit in hits:
                    etree.dump(hit)
            print(f'{label}: {len(hits)}')




if __name__ == '__main__':
    # run the demo only when this file is executed as a script
    trysomemwes()
Loading