ggunio committed · verified
Commit 7aabfdc · 1 Parent(s): 318d977

Upload demo_poc.py with huggingface_hub

Files changed (1):
demo_poc.py +266 -0
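
The commit message says the file was pushed with huggingface_hub. A minimal sketch of such an upload (the repo_id below is a placeholder, not taken from this page):

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` by default
    api.upload_file(
        path_or_fileobj="demo_poc.py",   # local file to push
        path_in_repo="demo_poc.py",      # destination path inside the repo
        repo_id="ggunio/<repo-name>",    # placeholder, not the actual repo id
        commit_message="Upload demo_poc.py with huggingface_hub",
    )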
demo_poc.py ADDED
@@ -0,0 +1,266 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ POC demo script - automatic chunked processing of long text.
+ """
+
+ import torch
+ import sys
+ import io
+ from pathlib import Path
+ import time
+
+ # Force UTF-8 output encoding
+ if sys.stdout.encoding != 'utf-8':
+     sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+     sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
+
+ sys.path.append(str(Path(__file__).parent))
+
+ from core.boundary_aware_model import BoundaryAwareTokenizerModel
+ from src.core.byte_tokenizer_v6 import ByteTokenizerV6
+
+ # Device
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ class IntelligentTokenizerPOC:
+     """Demo class for the POC."""
+
+     def __init__(self, checkpoint_path="checkpoints/unified/latest_checkpoint.pt"):
+         print("="*70)
+         print("INTELLIGENT TOKENIZER v6.0 - POC Demo")
+         print("="*70)
+         print(f"Device: {device}")
+         print("Loading checkpoint...")
+
+         # Load checkpoint
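+         # Note: weights_only=False lets torch.load unpickle arbitrary
+         # objects (needed here for the stored model_config dict), so the
+         # checkpoint must come from a trusted source.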
+         checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
+         self.model = BoundaryAwareTokenizerModel(**checkpoint['model_config'])
+         self.model.load_state_dict(checkpoint['model_state_dict'])
+         self.model = self.model.to(device)
+         self.model.eval()
+
+         self.tokenizer = ByteTokenizerV6()
+         self.max_chunk_size = 250  # slightly under 256 as a safety margin
+
+         print(f"Model loaded: Epoch {checkpoint['epoch']}, Loss {checkpoint['loss']:.4f}")
+         print("Current limitation: 256 bytes per chunk")
+         print("(Due to POC development constraints and limited GPU resources)")
+         print("="*70)
+         print()
+
+     def process_text(self, text: str, show_details=True):
+         """Process text, auto-splitting long inputs."""
+
+         # Convert to bytes
+         text_bytes = text.encode('utf-8')
+         total_bytes = len(text_bytes)
+
+         if show_details:
+             print(f"Input text: {text[:100]}..." if len(text) > 100 else f"Input text: {text}")
+             print(f"Total bytes: {total_bytes}")
+
+         # Auto-split when the input exceeds the chunk limit
+         if total_bytes > self.max_chunk_size:
+             chunks = self._split_text_safely(text)
+             if show_details:
+                 print(f"Auto-splitting into {len(chunks)} chunks (256 byte limit for POC)")
+                 print("Note: Production version will handle up to 4096+ bytes")
+                 print("-"*50)
+
+             results = []
+             total_compressed = 0
+
+             for i, chunk in enumerate(chunks):
+                 if show_details:
+                     print(f"\nChunk {i+1}/{len(chunks)}:")
+                 result = self._process_single_chunk(chunk, show_details)
+                 results.append(result)
+                 total_compressed += result['compressed_tokens']
+
+             # Overall statistics
+             if show_details:
+                 print("\n" + "="*50)
+                 print("OVERALL RESULTS:")
+                 print(f"Total input: {total_bytes} bytes")
+                 print(f"Total compressed: {total_compressed} tokens")
+                 print(f"Compression ratio: {total_bytes/total_compressed:.2f}x")
+                 print(f"Average accuracy: {sum(r['accuracy'] for r in results)/len(results):.1%}")
+
+             return results
+
+         else:
+             # Single-chunk path
+             return self._process_single_chunk(text, show_details)
+
+     def _split_text_safely(self, text: str):
+         """Split text safely on UTF-8 character boundaries."""
+         chunks = []
+         text_bytes = text.encode('utf-8')
+
+         start = 0
+         while start < len(text_bytes):
+             # Pick a tentative chunk end
+             end = min(start + self.max_chunk_size, len(text_bytes))
+
+             # Back up until the cut falls on a UTF-8 boundary (Hangul is 3 bytes per syllable)
+             while end > start and end < len(text_bytes):
+                 try:
+                     # Try decoding
+                     chunk = text_bytes[start:end].decode('utf-8')
+                     break
+                 except UnicodeDecodeError:
+                     # Not a valid boundary; step back one byte
+                     end -= 1
+
+             if end > start:
+                 chunk = text_bytes[start:end].decode('utf-8')
+                 chunks.append(chunk)
+                 start = end
+             else:
+                 break
+
+         return chunks
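+
+     # A fixed-offset cut can land mid-character: "한", for example, encodes
+     # to the three bytes ED 95 9C, so up to two backward steps are needed.
+     # An equivalent boundary check without try/except would skip UTF-8
+     # continuation bytes (those matching 0b10xxxxxx):
+     #
+     #     while end > start and (text_bytes[end] & 0xC0) == 0x80:
+     #         end -= 1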
+
+     def _process_single_chunk(self, text: str, show_details=True):
+         """Process a single chunk."""
+
+         # Encode to byte IDs
+         encoded = self.tokenizer.encode(text)
+         byte_ids = encoded['input_ids']
+         input_ids = torch.tensor([byte_ids], device=device)
+         attention_mask = torch.tensor([encoded['attention_mask']], device=device)
+
+         with torch.no_grad():
+             # Compress
+             start_time = time.time()
+             encoder_outputs = self.model.encoder(input_ids, attention_mask)
+             encoder_hidden = encoder_outputs['last_hidden_state']
+             compression_time = time.time() - start_time
+
+             compressed_tokens = encoder_hidden.shape[1]
+             compression_ratio = len(byte_ids) / compressed_tokens
+
+             # Reconstruction (teacher forcing)
+             if len(byte_ids) > 1:
+                 decoder_input = input_ids[:, :-1]
+                 labels = input_ids[:, 1:]
+
+                 outputs = self.model(
+                     input_ids=input_ids,
+                     attention_mask=attention_mask,
+                     decoder_input_ids=decoder_input,
+                     labels=labels,
+                     use_cross_attention=True
+                 )
+
+                 predictions = torch.argmax(outputs['logits'], dim=-1)
+                 accuracy = (predictions == labels).float().mean().item()
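+                 # Note: this accuracy is measured with teacher forcing (the
+                 # decoder is fed the ground-truth prefix at every step), so
+                 # it upper-bounds free-running reconstruction accuracy.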
+             else:
+                 accuracy = 1.0
+
+         if show_details:
+             print(f"  Input: {len(byte_ids)} bytes")
+             print(f"  Compressed: {compressed_tokens} tokens ({compression_ratio:.2f}x)")
+             print(f"  Accuracy: {accuracy:.1%}")
+             print(f"  Processing time: {compression_time*1000:.1f}ms")
+
+         return {
+             'text': text,
+             'input_bytes': len(byte_ids),
+             'compressed_tokens': compressed_tokens,
+             'compression_ratio': compression_ratio,
+             'accuracy': accuracy,
+             'time_ms': compression_time * 1000
+         }
+
+     def benchmark_languages(self):
+         """Multilingual benchmark."""
+         print("\n" + "="*70)
+         print("MULTILINGUAL BENCHMARK")
+         print("="*70)
+
+         test_samples = {
+             'English': "The quick brown fox jumps over the lazy dog",
+             'Korean': "안녕하세요. 오늘 날씨가 정말 좋네요",
+             'Chinese': "今天天气很好",
+             'Japanese': "こんにちは",
+             'Spanish': "Hola, ¿cómo estás?",
+             'Arabic': "مرحبا بك",
+             'Russian': "Привет, как дела?",
+         }
+
+         for lang, text in test_samples.items():
+             print(f"\n{lang}:")
+             self._process_single_chunk(text, show_details=True)
+
+     def explain_advantages(self):
+         """Summarize key advantages."""
+         print("\n" + "="*70)
+         print("KEY ADVANTAGES")
+         print("="*70)
+         print("""
+     1. PURE LEARNING-BASED
+        - No vocabulary files (260 fixed bytes vs 50K+ tokens)
+        - No language-specific rules
+        - Learns compression patterns from data
+
+     2. MULTILINGUAL EQUALITY
+        - All 204 languages treated equally
+        - No vocabulary bias towards English
+        - Better for low-resource languages
+
+     3. COMPRESSION CAPABILITY
+        - Current: 2-3x compression (POC stage)
+        - Target: 5-10x compression (with more training)
+        - API cost reduction: 50-80%
+
+     4. CURRENT LIMITATIONS (POC)
+        - 256 byte chunks (due to limited GPU resources)
+        - Will expand to 4096+ bytes post-POC
+        - Training on personal RTX 3060 (4 months development)
+
+     5. FUTURE ROADMAP
+        - Multimodal support (text + image + audio)
+        - Dynamic compression levels
+        - Real-time streaming mode
+     """)
+         print("="*70)
+
+ def main():
+     """Main demo."""
+     poc = IntelligentTokenizerPOC()
+
+     # 1. Short text demo
+     print("\n### SHORT TEXT DEMO ###")
+     poc.process_text("Hello, world!")
+     poc.process_text("안녕하세요. 반갑습니다.")
+
+     # 2. Long text auto-split demo
+     print("\n### LONG TEXT AUTO-SPLIT DEMO ###")
+     long_text = """
+     인공지능 기술이 빠르게 발전하고 있습니다. 특히 자연어 처리 분야에서
+     놀라운 성과를 보이고 있으며, 이는 우리의 일상생활에도 큰 영향을
+     미치고 있습니다. 앞으로 더 많은 혁신이 기대됩니다.
+
+     The development of artificial intelligence is accelerating rapidly.
+     Natural language processing, in particular, has shown remarkable progress,
+     significantly impacting our daily lives. We can expect even more innovations
+     in the near future.
+     """
+     poc.process_text(long_text)
+
+     # 3. Multilingual benchmark
+     poc.benchmark_languages()
+
+     # 4. Explain advantages
+     poc.explain_advantages()
+
+     print("\n" + "="*70)
+     print("POC DEMO COMPLETE")
+     print("Developed in 4 months by a solo developer with no prior AI experience")
+     print("Contact: [your contact info]")
+     print("="*70)
+
+ if __name__ == "__main__":
+     main()
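
To try the demo locally (assuming the repository's core/ and src/ modules and the checkpoint under checkpoints/unified/ are present, as the script expects):

    python demo_poc.py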