jvamvas's picture
Upload lemmatizer package
e49dd9d verified
"""A script from https://github.com/zhopto3/morpho-baseline defining
the edit tree class learned from the pledari grond dictionaries"""
from difflib import SequenceMatcher
class EditTreeNode(object):
    """One node of an edit tree that rewrites a string into another string.

    The kind of node is decided by the type of ``val[0]``:

    * leaf (``val[0]`` is a ``str``): ``val`` is ``(source, target)`` and the
      node rewrites exactly ``source`` into ``target``.
    * split (``val[0]`` is an ``int``): ``val`` is ``(prefix_len, suffix_len)``.
      The first ``prefix_len`` characters are handed to ``left``, the last
      ``suffix_len`` characters to ``right``, and everything in between is
      copied to the output unchanged.
    """

    def __init__(self, val):
        # Children stay None for leaves; split nodes get them assigned later.
        self.left = None
        self.right = None
        self.val = val

    def apply(self, word):
        """Apply the learned edit rules to ``word`` (lemmatization direction).

        The original code was changed so that it lemmatizes rather than
        completes a paradigm.

        Args:
            word: the string to rewrite; must be a ``str``.

        Returns:
            The rewritten string, or the sentinel ``-1`` when the tree does
            not apply to ``word``. ``None`` is returned if ``val[0]`` is
            neither a ``str`` nor an ``int``.
        """
        assert isinstance(word, str)
        head = self.val[0]

        if isinstance(head, str):  # replace
            return self.val[1] if word == head else -1

        if isinstance(head, int):  # split
            assert isinstance(self.left, EditTreeNode)
            assert isinstance(self.right, EditTreeNode)
            prefix_len = head
            suffix_len = self.val[1]
            # A word shorter than prefix + suffix would make the slices
            # overlap (or make the suffix index negative and wrap around),
            # letting the same characters be consumed by both subtrees and
            # producing a bogus rewrite. Such a word cannot match this tree.
            if len(word) < prefix_len + suffix_len:
                return -1
            suffix_start = len(word) - suffix_len
            new_prefix = self.left.apply(word[:prefix_len])
            if new_prefix == -1:
                return -1
            new_suffix = self.right.apply(word[suffix_start:])
            if new_suffix == -1:
                return -1
            # The middle part is the material shared by both forms.
            return new_prefix + word[prefix_len:suffix_start] + new_suffix

        # val[0] is neither str nor int: no rule to apply.
        return None

    def __str__(self):
        """Render the tree, one node per line, children indented one space."""
        if self.left is None:  # leaf
            return str(self.val)
        lines = [str(self.val)]
        for child in (self.left, self.right):
            lines.extend(" " + line for line in str(child).split("\n"))
        return "\n".join(lines).strip()

    def __hash__(self):
        # Hash the textual rendering so that structurally equal trees collide.
        # The hash changes if the node is mutated after being hashed.
        return hash(str(self))

    def __eq__(self, other):
        """Return True when ``other`` is a node with equal value and subtrees."""
        if not isinstance(other, EditTreeNode):
            return False
        return (
            self.left == other.left
            and self.right == other.right
            and self.val == other.val
        )
def longestSubstring(str1, str2):
    """Find the longest contiguous substring shared by two strings.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        A tuple ``(start1, start2, size)``: the start of the match in
        ``str1``, its start in ``str2`` and its length. ``size`` is 0 when
        the strings share no characters.
    """
    # autojunk=False: the default heuristic ignores characters that occur
    # often once the second string has 200+ items, which can hide the real
    # longest match. Results for shorter strings are unaffected.
    matcher = SequenceMatcher(None, str1, str2, autojunk=False)
    match = matcher.find_longest_match(0, len(str1), 0, len(str2))
    return (match.a, match.b, match.size)
def editTree(str1, str2):
    """Build the edit tree that turns ``str1`` into ``str2``.

    The longest substring common to both strings is kept as-is; the parts
    before and after it are handled recursively by the left and right
    subtrees. When nothing is shared, a leaf holding the pair
    ``(str1, str2)`` is produced.

    Returns:
        The root ``EditTreeNode``, or ``None`` if either argument is ``None``.
    """
    if str1 is None or str2 is None:
        return None

    src_start, tgt_start, shared_len = longestSubstring(str1, str2)
    if not shared_len:
        # No common material: plain replacement of str1 by str2.
        return EditTreeNode((str1, str2))

    src_end = src_start + shared_len
    tgt_end = tgt_start + shared_len

    # Split node stores how many characters precede / follow the shared part
    # in the source string.
    tree = EditTreeNode((src_start, len(str1) - src_end))
    tree.left = editTree(str1[:src_start], str2[:tgt_start])
    tree.right = editTree(str1[src_end:], str2[tgt_end:])
    return tree