Spaces:
Paused
Paused
| import os, re, string | |
| import subprocess | |
| from textblob_de import TextBlobDE as TextBlob | |
def clean_english(text):
    """Normalize spacing and capitalize the pronoun "I" in English text.

    Args:
        text: English input string.

    Returns:
        The text with runs of spaces collapsed to a single space and every
        standalone ``i`` that is followed by whitespace, an apostrophe, or
        the end of the string rewritten as ``I``.
    """
    # Replacing one space with one space is a no-op; collapse runs of
    # spaces instead.
    clean_text = re.sub(r' {2,}', ' ', text)
    # The leading \b keeps a word-final "i" (as in "taxi's") untouched, and
    # the lookahead leaves the character after the pronoun exactly as it
    # was, so newlines and tabs are not turned into spaces.
    return re.sub(r"\bi(?=\s|'|$)", 'I', clean_text)
def clean_german(text):
    """Strip ASCII punctuation from German text and capitalize its nouns.

    The cleaned text is part-of-speech tagged with ``TextBlob``; every token
    tagged ``'NN'`` that is longer than one character is replaced by its
    ``str.capitalize()`` form wherever it occurs as a whole word.

    Args:
        text: German input string.

    Returns:
        The punctuation-free text with the detected nouns capitalized.
    """
    text = text.translate(str.maketrans('', '', string.punctuation))

    # ``tags`` is consumed as (token, tag) pairs.
    tagged = TextBlob(text).tags
    nouns = {
        token: token.capitalize()
        for token, tag in tagged
        if tag == 'NN' and len(token) > 1
    }
    if not nouns:
        return text

    # Escape the tokens when building the pattern and look matches up by
    # their raw text, so the dictionary key always equals what was matched.
    alternatives = "|".join(re.escape(token) for token in nouns)
    # The lookarounds restrict replacement to whole words; without them a
    # noun embedded in a longer word would be capitalized mid-word.
    pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")
    return pattern.sub(lambda match: nouns[match.group(0)], text)
def clean_spanish(text):
    """Remove punctuation from Spanish text and normalize its spacing.

    Args:
        text: Spanish input string.

    Returns:
        The text without ASCII punctuation or the inverted question and
        exclamation marks, with runs of spaces collapsed to a single space.
    """
    # string.punctuation is ASCII only, so the Spanish opening marks
    # (U+00BF inverted question mark, U+00A1 inverted exclamation mark)
    # have to be added explicitly.
    unwanted = string.punctuation + '\u00bf\u00a1'
    clean_text = text.translate(str.maketrans('', '', unwanted))
    # Removing a free-standing mark (e.g. " - ") leaves two adjacent spaces;
    # replacing one space with one space is a no-op, so collapse runs.
    return re.sub(r' {2,}', ' ', clean_text)