""" |
|
|
B2NL-IntelligentTokenizer v6.2.1 - ์ค์ ์๋ํ๋ ์ถ๋ก ์ฝ๋ |
|
|
์ด ํ์ผ์ด ๋ฉ์ธ ์ฌ์ฉ๋ฒ์
๋๋ค. |
|
|
""" |
|
|
|
|
|
import sys
from pathlib import Path

import torch

# Make the v6.2.1 package and its core modules importable.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1"))
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1/core"))

from core.unified_model import IntelligentTokenizerV62
from core.tokenizer import ByteTokenizerV62


class B2NLTokenizer:
    """B2NL tokenizer wrapper (verified working)."""

    def __init__(self, checkpoint_path: str = None):
        """
        Args:
            checkpoint_path: Checkpoint path (the default below is used if omitted).
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        if checkpoint_path is None:
            checkpoint_path = "D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"

        # Build the model and restore the trained weights.
        self.model = IntelligentTokenizerV62()
        checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model = self.model.to(self.device)
        self.model.eval()

        print(f"Model loaded successfully on {self.device}")
|
    def compress(self, text: str) -> dict:
        """Compress text and return compression statistics."""
        return self.model.compress(text)

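    # Hedged usage note: `compress` delegates to the model; judging from
    # test_tokenizer below, the returned dict includes at least
    # 'compression_ratio' and 'num_tokens'. For example:
    #
    #   info = tokenizer.compress("Hello, world!")
    #   print(info['compression_ratio'], info['num_tokens'])
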
    def reconstruct(self, text: str, temperature: float = 0.1) -> str:
        """
        Compress text and then reconstruct it (verified working version).

        Args:
            text: Input text.
            temperature: Generation temperature (lower is more deterministic).

        Returns:
            The reconstructed text.
        """
        # Encode the input with the model's own byte-level tokenizer.
        tokenizer = self.model.tokenizer
        encoded = tokenizer.encode(text)

        # Normalize the encoded output to batched tensors.
        if isinstance(encoded, dict):
            input_ids = encoded['input_ids'].unsqueeze(0) if encoded['input_ids'].dim() == 1 else encoded['input_ids']
            attention_mask = encoded['attention_mask'].unsqueeze(0) if encoded['attention_mask'].dim() == 1 else encoded['attention_mask']
        else:
            input_ids = encoded.unsqueeze(0) if encoded.dim() == 1 else encoded
            attention_mask = torch.ones_like(input_ids)

        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)

        # Run the encoder once to obtain the compressed representation.
        with torch.no_grad():
            encoder_outputs = self.model.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

        # The decoder consumes per-layer encoder states; if they are not
        # available, fall back to four copies of the compressed representation.
        if 'all_hidden_states' in encoder_outputs:
            encoder_all_hidden = encoder_outputs['all_hidden_states']
        else:
            compressed = encoder_outputs.get('compressed', encoder_outputs.get('hidden_states'))
            encoder_all_hidden = [compressed] * 4

        batch_size = input_ids.size(0)
        max_length = 48

        # Autoregressive decoding, starting from the BOS token.
        generated = torch.full((batch_size, 1), tokenizer.BOS, device=self.device)

        for step in range(max_length - 1):
            with torch.no_grad():
                decoder_outputs = self.model.decoder(
                    encoder_all_hidden=encoder_all_hidden,
                    decoder_input_ids=generated,
                    attention_mask=torch.ones_like(generated),
                    use_cache=False
                )

            # Temperature-scaled logits for the next position.
            logits = decoder_outputs['logits'][:, -1, :] / temperature

            # Top-k filtering: mask every logit below the k-th largest.
            top_k = 10
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = float('-inf')

            # Sample the next token from the filtered distribution.
            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            generated = torch.cat([generated, next_token], dim=1)

            # Stop once every sequence in the batch has emitted EOS.
            if (next_token == tokenizer.EOS).all():
                break

        # Decode the generated ids back to text.
        if generated.dim() > 1:
            text = tokenizer.decode(generated[0])
        else:
            text = tokenizer.decode(generated)

        return text


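# Hedged sketch: a deterministic alternative to the temperature/top-k sampling
# step inside `reconstruct`. `greedy_next_token` is a hypothetical helper, not
# part of the original API; it takes the argmax of the last-step logits instead
# of sampling, which makes temperature scaling and top-k masking unnecessary.
def greedy_next_token(logits: torch.Tensor) -> torch.Tensor:
    """Return the most likely next-token ids, shape (batch, 1)."""
    return logits.argmax(dim=-1, keepdim=True)

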
def test_tokenizer():
    """Test the tokenizer on a few multilingual samples."""
    print("="*60)
    print("B2NL-IntelligentTokenizer v6.2.1 test")
    print("="*60)

    tokenizer = B2NLTokenizer()

    test_texts = [
        "Hello, world!",
        "์๋
ํ์ธ์, ๋ฐ๊ฐ์ต๋๋ค.",
        "The quick brown fox jumps over the lazy dog.",
        "ไบบๅทฅๆบ่ฝๆๆฏๆญฃๅจๆนๅไธ็ใ",
    ]

    for text in test_texts:
        print(f"\nOriginal: {text}")

        # Compression statistics.
        compressed = tokenizer.compress(text)
        print(f"Compression: {compressed['compression_ratio']:.1f}:1 ({compressed['num_tokens']} tokens)")

        # Round-trip reconstruction.
        reconstructed = tokenizer.reconstruct(text, temperature=0.1)
        print(f"Reconstructed: {reconstructed}")

        # Character-level accuracy: matching positions up to the shorter
        # string, divided by the original length.
        min_len = min(len(text), len(reconstructed))
        accuracy = sum(1 for i in range(min_len) if text[i] == reconstructed[i]) / len(text) * 100
        print(f"Accuracy: {accuracy:.1f}%")

    print("\n" + "="*60)
    print("Test completed!")
    print("="*60)


def example_usage():
    """Minimal usage example."""
    tokenizer = B2NLTokenizer()

    text = "์๋
ํ์ธ์, ๋ฐ๊ฐ์ต๋๋ค!"
    compressed = tokenizer.compress(text)
    print(f"Compression: {compressed['compression_ratio']:.1f}:1")

    reconstructed = tokenizer.reconstruct(text)
    print(f"Reconstruction: {reconstructed}")

    return tokenizer


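# Hedged example: constructing the wrapper with an explicit checkpoint path
# instead of the hard-coded default in __init__. The path below is a
# placeholder; point it at your own trained checkpoint file.
def example_custom_checkpoint():
    """Load from a user-supplied checkpoint and run one round trip."""
    tokenizer = B2NLTokenizer(checkpoint_path="path/to/checkpoints/v62/16.0/epoch_100.pt")
    print(tokenizer.reconstruct("Hello, world!"))
    return tokenizer

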
if __name__ == "__main__": |
|
|
test_tokenizer() |