Spaces:
Running
Running
Gergo Szabo
committed on
Commit
·
01ce750
1
Parent(s):
bfa3dae
Changed datasets to dataframes for ease-of-use. (#238)
Browse files- examples/Unsloth/Demo.lynxkite.json +0 -0
- examples/Unsloth/boxes.py +34 -33
examples/Unsloth/Demo.lynxkite.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
examples/Unsloth/boxes.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import enum
|
| 2 |
from lynxkite_core import ops
|
| 3 |
-
from lynxkite_graph_analytics.core import Bundle
|
| 4 |
import unsloth
|
| 5 |
import trl
|
| 6 |
-
from datasets import load_dataset
|
| 7 |
import unsloth.chat_templates
|
| 8 |
from transformers.training_args import OptimizerNames
|
| 9 |
from transformers.trainer_utils import SchedulerType
|
|
@@ -15,10 +15,10 @@ op = ops.op_registration("LynxKite Graph Analytics", "Unsloth")
|
|
| 15 |
def load_base_model(
|
| 16 |
*,
|
| 17 |
model_name: str,
|
| 18 |
-
max_seq_length=2048,
|
| 19 |
-
load_in_4bit=False,
|
| 20 |
-
load_in_8bit=False,
|
| 21 |
-
full_finetuning=False,
|
| 22 |
):
|
| 23 |
model, tokenizer = unsloth.FastModel.from_pretrained(
|
| 24 |
model_name=model_name,
|
|
@@ -59,46 +59,46 @@ def configure_lora(bundle: Bundle, *, r=128, lora_dropout=0, random_state=1, ran
|
|
| 59 |
|
| 60 |
@op("Load HF dataset", slow=True, cache=False)
|
| 61 |
def load_hf_dataset(*, name: str, split="train[:10000]") -> Bundle:
|
| 62 |
-
return Bundle(
|
| 63 |
|
| 64 |
|
| 65 |
@op("Convert to ChatML", slow=True, cache=False)
|
| 66 |
def convert_to_chatml(
|
| 67 |
bundle: Bundle,
|
| 68 |
*,
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
| 73 |
):
|
| 74 |
bundle = bundle.copy()
|
| 75 |
-
ds = bundle.
|
| 76 |
-
bundle.
|
| 77 |
-
lambda e:
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
}
|
| 84 |
)
|
| 85 |
return bundle
|
| 86 |
|
| 87 |
|
| 88 |
@op("Apply chat template", slow=True, cache=False)
|
| 89 |
-
def apply_chat_template(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
bundle = bundle.copy()
|
| 91 |
tokenizer = bundle.other["tokenizer"]
|
| 92 |
-
bundle.
|
| 93 |
-
lambda e:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
convo, tokenize=False, add_generation_prompt=False
|
| 97 |
-
).removeprefix("<bos>")
|
| 98 |
-
for convo in e[conversations_field]
|
| 99 |
-
]
|
| 100 |
-
},
|
| 101 |
-
batched=True,
|
| 102 |
)
|
| 103 |
return bundle
|
| 104 |
|
|
@@ -107,7 +107,8 @@ def apply_chat_template(bundle: Bundle, *, conversations_field="conversations",
|
|
| 107 |
def train_llm(
|
| 108 |
bundle: Bundle,
|
| 109 |
*,
|
| 110 |
-
|
|
|
|
| 111 |
train_on_responses_only=True,
|
| 112 |
per_device_train_batch_size=8,
|
| 113 |
gradient_accumulation_steps=1,
|
|
@@ -123,7 +124,7 @@ def train_llm(
|
|
| 123 |
):
|
| 124 |
model = bundle.other["model"]
|
| 125 |
tokenizer = bundle.other["tokenizer"]
|
| 126 |
-
dataset = bundle.
|
| 127 |
trainer = trl.SFTTrainer(
|
| 128 |
model=model,
|
| 129 |
tokenizer=tokenizer,
|
|
|
|
| 1 |
import enum
|
| 2 |
from lynxkite_core import ops
|
| 3 |
+
from lynxkite_graph_analytics.core import Bundle, TableName, ColumnNameByTableName
|
| 4 |
import unsloth
|
| 5 |
import trl
|
| 6 |
+
from datasets import load_dataset, Dataset
|
| 7 |
import unsloth.chat_templates
|
| 8 |
from transformers.training_args import OptimizerNames
|
| 9 |
from transformers.trainer_utils import SchedulerType
|
|
|
|
| 15 |
def load_base_model(
|
| 16 |
*,
|
| 17 |
model_name: str,
|
| 18 |
+
max_seq_length: int = 2048,
|
| 19 |
+
load_in_4bit: bool = False,
|
| 20 |
+
load_in_8bit: bool = False,
|
| 21 |
+
full_finetuning: bool = False,
|
| 22 |
):
|
| 23 |
model, tokenizer = unsloth.FastModel.from_pretrained(
|
| 24 |
model_name=model_name,
|
|
|
|
| 59 |
|
| 60 |
@op("Load HF dataset", slow=True, cache=False)
|
| 61 |
def load_hf_dataset(*, name: str, split="train[:10000]") -> Bundle:
|
| 62 |
+
return Bundle(dfs={"dataset": load_dataset(name, split=split).to_pandas()})
|
| 63 |
|
| 64 |
|
| 65 |
@op("Convert to ChatML", slow=True, cache=False)
|
| 66 |
def convert_to_chatml(
|
| 67 |
bundle: Bundle,
|
| 68 |
*,
|
| 69 |
+
table_name: TableName,
|
| 70 |
+
system_column_name: ColumnNameByTableName,
|
| 71 |
+
user_column_name: ColumnNameByTableName,
|
| 72 |
+
assistant_column_name: ColumnNameByTableName,
|
| 73 |
+
save_as: str = "conversations",
|
| 74 |
):
|
| 75 |
bundle = bundle.copy()
|
| 76 |
+
ds = bundle.dfs[table_name]
|
| 77 |
+
bundle.dfs[table_name][save_as] = ds.apply(
|
| 78 |
+
lambda e: [
|
| 79 |
+
{"role": "system", "content": e[system_column_name]},
|
| 80 |
+
{"role": "user", "content": e[user_column_name]},
|
| 81 |
+
{"role": "assistant", "content": e[assistant_column_name]},
|
| 82 |
+
],
|
| 83 |
+
axis=1,
|
|
|
|
| 84 |
)
|
| 85 |
return bundle
|
| 86 |
|
| 87 |
|
| 88 |
@op("Apply chat template", slow=True, cache=False)
|
| 89 |
+
def apply_chat_template(
|
| 90 |
+
bundle: Bundle,
|
| 91 |
+
*,
|
| 92 |
+
table_name: TableName,
|
| 93 |
+
conversations_field: ColumnNameByTableName,
|
| 94 |
+
save_as="text",
|
| 95 |
+
):
|
| 96 |
bundle = bundle.copy()
|
| 97 |
tokenizer = bundle.other["tokenizer"]
|
| 98 |
+
bundle.dfs[table_name][save_as] = bundle.dfs[table_name][conversations_field].map(
|
| 99 |
+
lambda e: tokenizer.apply_chat_template(
|
| 100 |
+
e, tokenize=False, add_generation_prompt=False
|
| 101 |
+
).removeprefix("<bos>"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
)
|
| 103 |
return bundle
|
| 104 |
|
|
|
|
| 107 |
def train_llm(
|
| 108 |
bundle: Bundle,
|
| 109 |
*,
|
| 110 |
+
table_name: TableName,
|
| 111 |
+
dataset_text_field: ColumnNameByTableName,
|
| 112 |
train_on_responses_only=True,
|
| 113 |
per_device_train_batch_size=8,
|
| 114 |
gradient_accumulation_steps=1,
|
|
|
|
| 124 |
):
|
| 125 |
model = bundle.other["model"]
|
| 126 |
tokenizer = bundle.other["tokenizer"]
|
| 127 |
+
dataset = Dataset.from_pandas(bundle.dfs[table_name])
|
| 128 |
trainer = trl.SFTTrainer(
|
| 129 |
model=model,
|
| 130 |
tokenizer=tokenizer,
|