Gergo Szabo committed on
Commit
01ce750
·
1 Parent(s): bfa3dae

Changed datasets to dataframes for ease-of-use. (#238)

Browse files
examples/Unsloth/Demo.lynxkite.json CHANGED
The diff for this file is too large to render. See raw diff
 
examples/Unsloth/boxes.py CHANGED
@@ -1,9 +1,9 @@
1
  import enum
2
  from lynxkite_core import ops
3
- from lynxkite_graph_analytics.core import Bundle
4
  import unsloth
5
  import trl
6
- from datasets import load_dataset
7
  import unsloth.chat_templates
8
  from transformers.training_args import OptimizerNames
9
  from transformers.trainer_utils import SchedulerType
@@ -15,10 +15,10 @@ op = ops.op_registration("LynxKite Graph Analytics", "Unsloth")
15
  def load_base_model(
16
  *,
17
  model_name: str,
18
- max_seq_length=2048,
19
- load_in_4bit=False,
20
- load_in_8bit=False,
21
- full_finetuning=False,
22
  ):
23
  model, tokenizer = unsloth.FastModel.from_pretrained(
24
  model_name=model_name,
@@ -59,46 +59,46 @@ def configure_lora(bundle: Bundle, *, r=128, lora_dropout=0, random_state=1, ran
59
 
60
  @op("Load HF dataset", slow=True, cache=False)
61
  def load_hf_dataset(*, name: str, split="train[:10000]") -> Bundle:
62
- return Bundle(other={"dataset": load_dataset(name, split=split)})
63
 
64
 
65
  @op("Convert to ChatML", slow=True, cache=False)
66
  def convert_to_chatml(
67
  bundle: Bundle,
68
  *,
69
- system_column_name: str,
70
- user_column_name: str,
71
- assistant_column_name: str,
72
- save_as="conversations",
 
73
  ):
74
  bundle = bundle.copy()
75
- ds = bundle.other["dataset"]
76
- bundle.other["dataset"] = ds.map(
77
- lambda e: {
78
- save_as: [
79
- {"role": "system", "content": e[system_column_name]},
80
- {"role": "user", "content": e[user_column_name]},
81
- {"role": "assistant", "content": e[assistant_column_name]},
82
- ]
83
- }
84
  )
85
  return bundle
86
 
87
 
88
  @op("Apply chat template", slow=True, cache=False)
89
- def apply_chat_template(bundle: Bundle, *, conversations_field="conversations", save_as="text"):
 
 
 
 
 
 
90
  bundle = bundle.copy()
91
  tokenizer = bundle.other["tokenizer"]
92
- bundle.other["dataset"] = bundle.other["dataset"].map(
93
- lambda e: {
94
- save_as: [
95
- tokenizer.apply_chat_template(
96
- convo, tokenize=False, add_generation_prompt=False
97
- ).removeprefix("<bos>")
98
- for convo in e[conversations_field]
99
- ]
100
- },
101
- batched=True,
102
  )
103
  return bundle
104
 
@@ -107,7 +107,8 @@ def apply_chat_template(bundle: Bundle, *, conversations_field="conversations",
107
  def train_llm(
108
  bundle: Bundle,
109
  *,
110
- dataset_text_field="text",
 
111
  train_on_responses_only=True,
112
  per_device_train_batch_size=8,
113
  gradient_accumulation_steps=1,
@@ -123,7 +124,7 @@ def train_llm(
123
  ):
124
  model = bundle.other["model"]
125
  tokenizer = bundle.other["tokenizer"]
126
- dataset = bundle.other["dataset"]
127
  trainer = trl.SFTTrainer(
128
  model=model,
129
  tokenizer=tokenizer,
 
1
  import enum
2
  from lynxkite_core import ops
3
+ from lynxkite_graph_analytics.core import Bundle, TableName, ColumnNameByTableName
4
  import unsloth
5
  import trl
6
+ from datasets import load_dataset, Dataset
7
  import unsloth.chat_templates
8
  from transformers.training_args import OptimizerNames
9
  from transformers.trainer_utils import SchedulerType
 
15
  def load_base_model(
16
  *,
17
  model_name: str,
18
+ max_seq_length: int = 2048,
19
+ load_in_4bit: bool = False,
20
+ load_in_8bit: bool = False,
21
+ full_finetuning: bool = False,
22
  ):
23
  model, tokenizer = unsloth.FastModel.from_pretrained(
24
  model_name=model_name,
 
59
 
60
@op("Load HF dataset", slow=True, cache=False)
def load_hf_dataset(*, name: str, split="train[:10000]") -> Bundle:
    """Download a Hugging Face dataset split and expose it as a DataFrame.

    The dataset is converted to pandas and stored in the bundle's ``dfs``
    mapping under the key ``"dataset"`` so downstream table-based ops can
    reference it by table name.
    """
    hf_dataset = load_dataset(name, split=split)
    return Bundle(dfs={"dataset": hf_dataset.to_pandas()})
63
 
64
 
65
@op("Convert to ChatML", slow=True, cache=False)
def convert_to_chatml(
    bundle: Bundle,
    *,
    table_name: TableName,
    system_column_name: ColumnNameByTableName,
    user_column_name: ColumnNameByTableName,
    assistant_column_name: ColumnNameByTableName,
    save_as: str = "conversations",
):
    """Build a ChatML-style conversation column from three text columns.

    For every row of ``table_name``, assembles a three-turn conversation
    (system, user, assistant) from the named columns and stores the list of
    message dicts in a new column ``save_as``.
    """
    bundle = bundle.copy()
    df = bundle.dfs[table_name]

    def _row_to_messages(row):
        # One conversation: fixed system/user/assistant turn order per row.
        return [
            {"role": "system", "content": row[system_column_name]},
            {"role": "user", "content": row[user_column_name]},
            {"role": "assistant", "content": row[assistant_column_name]},
        ]

    # NOTE(review): column is added to the DataFrame held by the copied
    # bundle; if Bundle.copy() is shallow this also touches the original's
    # DataFrame — confirm against Bundle.copy semantics.
    bundle.dfs[table_name][save_as] = df.apply(_row_to_messages, axis=1)
    return bundle
86
 
87
 
88
@op("Apply chat template", slow=True, cache=False)
def apply_chat_template(
    bundle: Bundle,
    *,
    table_name: TableName,
    conversations_field: ColumnNameByTableName,
    save_as="text",
):
    """Render each conversation to a plain-text prompt via the tokenizer.

    Applies the tokenizer's chat template (without tokenizing and without a
    generation prompt) to every conversation in ``conversations_field`` of
    ``table_name``, strips a leading ``<bos>`` marker, and stores the result
    in column ``save_as``.
    """
    bundle = bundle.copy()
    tokenizer = bundle.other["tokenizer"]
    df = bundle.dfs[table_name]

    def _render(conversation):
        text = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        # The template prepends <bos>; the trainer adds its own, so drop it.
        return text.removeprefix("<bos>")

    bundle.dfs[table_name][save_as] = df[conversations_field].map(_render)
    return bundle
104
 
 
107
  def train_llm(
108
  bundle: Bundle,
109
  *,
110
+ table_name: TableName,
111
+ dataset_text_field: ColumnNameByTableName,
112
  train_on_responses_only=True,
113
  per_device_train_batch_size=8,
114
  gradient_accumulation_steps=1,
 
124
  ):
125
  model = bundle.other["model"]
126
  tokenizer = bundle.other["tokenizer"]
127
+ dataset = Dataset.from_pandas(bundle.dfs[table_name])
128
  trainer = trl.SFTTrainer(
129
  model=model,
130
  tokenizer=tokenizer,