Spaces:

abnerh
/

video-to-subs

Paused

video-to-subs / clean_text.py

german and spanish

0cc2cbd almost 4 years ago

947 Bytes

	import os, re, string
	import subprocess
	from textblob_de import TextBlobDE as TextBlob


	def clean_english(text):
	    """Normalize spacing and capitalize the pronoun "I" in English text.

	    Every run of two or more spaces is collapsed to a single space, and a
	    standalone lowercase ``i`` is upper-cased when it is followed by
	    whitespace, an apostrophe (``i'm``, ``i'll``) or the end of the text.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string.
	    """
	    # Collapse every run of spaces into a single space.
	    clean_text = re.sub(r' {2,}', ' ', text)

	    # The leading \b keeps words that merely end in "i" ("Levi's", "ski")
	    # intact; the lookahead means only the letter itself is rewritten, so
	    # the character that follows it (space, newline, apostrophe) survives.
	    clean_text = re.sub(r"\bi(?=\s|'|$)", 'I', clean_text)

	    return clean_text

	def clean_german(text):
	    """Strip ASCII punctuation from German text and capitalize its nouns.

	    The text is part-of-speech tagged with ``TextBlob`` (the file imports
	    ``textblob_de.TextBlobDE`` under that name). Every token tagged ``NN``
	    that is longer than one character is replaced, wherever it occurs as a
	    whole word, by its ``str.capitalize()`` form.

	    Args:
	        text: The string to clean.

	    Returns:
	        The text without ``string.punctuation`` characters and with the
	        detected nouns capitalized. If no noun is found, the
	        punctuation-free text is returned unchanged.
	    """
	    text = text.translate(str.maketrans('', '', string.punctuation))

	    # Tokenize and POS-tag the German text; tags are (word, tag) pairs.
	    tagged = TextBlob(text).tags

	    # Map each noun to its capitalized spelling.
	    nouns = {
	        word: word.capitalize()
	        for word, tag in tagged
	        if tag == 'NN' and len(word) > 1
	    }
	    if not nouns:
	        return text

	    # Build a real alternation ("a|b|c"). Each noun is escaped so it is
	    # matched literally, and longer nouns come first so a shorter noun can
	    # never pre-empt a longer one that starts with the same letters.
	    alternatives = '|'.join(
	        re.escape(word) for word in sorted(nouns, key=len, reverse=True)
	    )
	    # Word boundaries stop a short noun from being capitalized in the
	    # middle of a longer word.
	    pattern = re.compile(r'\b(?:' + alternatives + r')\b')

	    # The matched text is exactly one of the dictionary keys, so it is
	    # looked up as-is (escaping belongs in the pattern, not the lookup).
	    return pattern.sub(lambda match: nouns[match.group(0)], text)


	def clean_spanish(text):
	    """Strip punctuation from Spanish text and collapse repeated spaces.

	    Besides the ASCII characters in ``string.punctuation``, the inverted
	    question and exclamation marks and the angle quotation marks used in
	    Spanish are removed as well. Runs of spaces (including those left
	    behind by removed punctuation) are collapsed to a single space.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string.
	    """
	    # string.punctuation is ASCII-only, so add the Spanish-specific marks.
	    removable = string.punctuation + '¿¡«»'
	    clean_text = text.translate(str.maketrans('', '', removable))

	    # Collapse every run of spaces into a single space.
	    clean_text = re.sub(r' {2,}', ' ', clean_text)

	    return clean_text