Spaces:

DmitryRyumin
/

BiBiER

Running

App Files Files Community

BiBiER / synthetic_utils /text_generation.py

farbverlauf

gpu

960b1a0 6 months ago

raw

history blame contribute delete

3.83 kB

	import logging
	import torch
	import random
	from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

	class TextGenerator:
	def __init__(
	self,
	model_name="gpt2",
	device="cuda",
	max_new_tokens=50,
	temperature=1.0,
	top_p=0.95,
	seed=None
	):
	self.model_name = model_name
	self.device = device
	self.max_new_tokens = max_new_tokens
	self.temperature = temperature
	self.top_p = top_p
	self.seed = seed

	logging.info(f"[TextGenerator] Загрузка модели {model_name} на {device} ...")
	self.tokenizer = AutoTokenizer.from_pretrained(model_name)
	self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

	if seed is not None:
	set_seed(seed)
	logging.info(f"[TextGenerator] Сид генерации установлен через transformers.set_seed({seed})")
	else:
	logging.info("[TextGenerator] Сид генерации не установлен (seed=None)")

	# --- Примеры для few-shot обучения ---
	self.fewshot_examples = [
	("happy", "We finally made it!", "We finally made it! I’ve never felt so alive and proud of what we accomplished."),
	("sad", "He didn't come back.", "He didn't come back. I waited all night, hoping to see him again."),
	("anger", "Why would you do that?", "Why would you do that? You had no right to interfere!"),
	("fear", "Did you hear that?", "Did you hear that? Something’s moving outside the window..."),
	("surprise", "Oh wow, really?", "Oh wow, really? I didn’t see that coming at all!"),
	("disgust", "That smell is awful.", "That smell is awful. I feel like I’m going to be sick."),
	("neutral", "Let's meet at noon.", "Let's meet at noon. We’ll have plenty of time to talk then.")
	]

	def build_prompt(self, emotion: str, partial_text: str) -> str:
	few_shot = random.sample(self.fewshot_examples, 2)
	examples_str = ""
	for emo, text, cont in few_shot:
	examples_str += (
	f"Example:\n"
	f"Emotion: {emo}\n"
	f"Text: {text}\n"
	f"Continuation: {cont}\n\n"
	)

	prompt = (
	"You are a helpful assistant that generates emotionally-aligned sentence continuations.\n"
	"You must include the original sentence in the output, and then continue it in a fluent and emotionally appropriate way.\n\n"
	f"{examples_str}"
	f"Now try:\n"
	f"Emotion: {emotion}\n"
	f"Text: {partial_text}\n"
	f"Continuation:"
	)
	return prompt

	def generate_text(self, emotion: str, partial_text: str = "") -> str:
	prompt = self.build_prompt(emotion, partial_text)
	logging.debug(f"[TextGenerator] prompt:\n{prompt}")

	inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

	output_ids = self.model.generate(
	**inputs,
	max_new_tokens=self.max_new_tokens,
	do_sample=True,
	top_p=self.top_p,
	temperature=self.temperature,
	pad_token_id=self.tokenizer.eos_token_id
	)

	full_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
	logging.debug(f"[TextGenerator] decoded:\n{full_text}")

	# Вытаскиваем то, что идёт после последнего "Continuation:"
	if "Continuation:" in full_text:
	result = full_text.split("Continuation:")[-1].strip()
	else:
	result = full_text.strip()

	result = result.split("\n")[0].strip()
	return result