Gergo Szabo committed on
Commit
01ce750
·
1 Parent(s): bfa3dae

Changed datasets to dataframes for ease-of-use. (#238)

Browse files
examples/Unsloth/Demo.lynxkite.json CHANGED
The diff for this file is too large to render. See raw diff
 
examples/Unsloth/boxes.py CHANGED
@@ -1,9 +1,9 @@
1
  import enum
2
  from lynxkite_core import ops
3
- from lynxkite_graph_analytics.core import Bundle
4
  import unsloth
5
  import trl
6
- from datasets import load_dataset
7
  import unsloth.chat_templates
8
  from transformers.training_args import OptimizerNames
9
  from transformers.trainer_utils import SchedulerType
@@ -15,10 +15,10 @@ op = ops.op_registration("LynxKite Graph Analytics", "Unsloth")
15
  def load_base_model(
16
  *,
17
  model_name: str,
18
- max_seq_length=2048,
19
- load_in_4bit=False,
20
- load_in_8bit=False,
21
- full_finetuning=False,
22
  ):
23
  model, tokenizer = unsloth.FastModel.from_pretrained(
24
  model_name=model_name,
@@ -59,46 +59,46 @@ def configure_lora(bundle: Bundle, *, r=128, lora_dropout=0, random_state=1, ran
59
 
60
  @op("Load HF dataset", slow=True, cache=False)
61
  def load_hf_dataset(*, name: str, split="train[:10000]") -> Bundle:
62
- return Bundle(other={"dataset": load_dataset(name, split=split)})
63
 
64
 
65
  @op("Convert to ChatML", slow=True, cache=False)
66
  def convert_to_chatml(
67
  bundle: Bundle,
68
  *,
69
- system_column_name: str,
70
- user_column_name: str,
71
- assistant_column_name: str,
72
- save_as="conversations",
 
73
  ):
74
  bundle = bundle.copy()
75
- ds = bundle.other["dataset"]
76
- bundle.other["dataset"] = ds.map(
77
- lambda e: {
78
- save_as: [
79
- {"role": "system", "content": e[system_column_name]},
80
- {"role": "user", "content": e[user_column_name]},
81
- {"role": "assistant", "content": e[assistant_column_name]},
82
- ]
83
- }
84
  )
85
  return bundle
86
 
87
 
88
  @op("Apply chat template", slow=True, cache=False)
89
- def apply_chat_template(bundle: Bundle, *, conversations_field="conversations", save_as="text"):
 
 
 
 
 
 
90
  bundle = bundle.copy()
91
  tokenizer = bundle.other["tokenizer"]
92
- bundle.other["dataset"] = bundle.other["dataset"].map(
93
- lambda e: {
94
- save_as: [
95
- tokenizer.apply_chat_template(
96
- convo, tokenize=False, add_generation_prompt=False
97
- ).removeprefix("<bos>")
98
- for convo in e[conversations_field]
99
- ]
100
- },
101
- batched=True,
102
  )
103
  return bundle
104
 
@@ -107,7 +107,8 @@ def apply_chat_template(bundle: Bundle, *, conversations_field="conversations",
107
  def train_llm(
108
  bundle: Bundle,
109
  *,
110
- dataset_text_field="text",
 
111
  train_on_responses_only=True,
112
  per_device_train_batch_size=8,
113
  gradient_accumulation_steps=1,
@@ -123,7 +124,7 @@ def train_llm(
123
  ):
124
  model = bundle.other["model"]
125
  tokenizer = bundle.other["tokenizer"]
126
- dataset = bundle.other["dataset"]
127
  trainer = trl.SFTTrainer(
128
  model=model,
129
  tokenizer=tokenizer,
 
1
  import enum
2
  from lynxkite_core import ops
3
+ from lynxkite_graph_analytics.core import Bundle, TableName, ColumnNameByTableName
4
  import unsloth
5
  import trl
6
+ from datasets import load_dataset, Dataset
7
  import unsloth.chat_templates
8
  from transformers.training_args import OptimizerNames
9
  from transformers.trainer_utils import SchedulerType
 
15
  def load_base_model(
16
  *,
17
  model_name: str,
18
+ max_seq_length: int = 2048,
19
+ load_in_4bit: bool = False,
20
+ load_in_8bit: bool = False,
21
+ full_finetuning: bool = False,
22
  ):
23
  model, tokenizer = unsloth.FastModel.from_pretrained(
24
  model_name=model_name,
 
59
 
60
@op("Load HF dataset", slow=True, cache=False)
def load_hf_dataset(*, name: str, split="train[:10000]") -> Bundle:
    """Download a Hugging Face dataset split and expose it as a DataFrame.

    The dataset is converted to pandas and stored in the bundle's ``dfs``
    mapping under the key ``"dataset"`` so downstream table-based ops can
    reference it by table name.
    """
    hf_dataset = load_dataset(name, split=split)
    return Bundle(dfs={"dataset": hf_dataset.to_pandas()})
63
 
64
 
65
@op("Convert to ChatML", slow=True, cache=False)
def convert_to_chatml(
    bundle: Bundle,
    *,
    table_name: TableName,
    system_column_name: ColumnNameByTableName,
    user_column_name: ColumnNameByTableName,
    assistant_column_name: ColumnNameByTableName,
    save_as: str = "conversations",
):
    """Build a ChatML-style conversation column from three text columns.

    For every row of ``table_name``, assembles a three-turn conversation
    (system, user, assistant) from the named columns and stores the list of
    message dicts in a new column ``save_as``.
    """
    bundle = bundle.copy()
    df = bundle.dfs[table_name]

    def _row_to_messages(row):
        # One conversation: fixed system/user/assistant turn order per row.
        return [
            {"role": "system", "content": row[system_column_name]},
            {"role": "user", "content": row[user_column_name]},
            {"role": "assistant", "content": row[assistant_column_name]},
        ]

    # NOTE(review): column is added to the DataFrame held by the copied
    # bundle; if Bundle.copy() is shallow this also touches the original's
    # DataFrame — confirm against Bundle.copy semantics.
    bundle.dfs[table_name][save_as] = df.apply(_row_to_messages, axis=1)
    return bundle
86
 
87
 
88
@op("Apply chat template", slow=True, cache=False)
def apply_chat_template(
    bundle: Bundle,
    *,
    table_name: TableName,
    conversations_field: ColumnNameByTableName,
    save_as="text",
):
    """Render each conversation to a plain-text prompt via the tokenizer.

    Applies the tokenizer's chat template (without tokenizing and without a
    generation prompt) to every conversation in ``conversations_field`` of
    ``table_name``, strips a leading ``<bos>`` marker, and stores the result
    in column ``save_as``.
    """
    bundle = bundle.copy()
    tokenizer = bundle.other["tokenizer"]
    df = bundle.dfs[table_name]

    def _render(conversation):
        text = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        # The template prepends <bos>; the trainer adds its own, so drop it.
        return text.removeprefix("<bos>")

    bundle.dfs[table_name][save_as] = df[conversations_field].map(_render)
    return bundle
104
 
 
107
  def train_llm(
108
  bundle: Bundle,
109
  *,
110
+ table_name: TableName,
111
+ dataset_text_field: ColumnNameByTableName,
112
  train_on_responses_only=True,
113
  per_device_train_batch_size=8,
114
  gradient_accumulation_steps=1,
 
124
  ):
125
  model = bundle.other["model"]
126
  tokenizer = bundle.other["tokenizer"]
127
+ dataset = Dataset.from_pandas(bundle.dfs[table_name])
128
  trainer = trl.SFTTrainer(
129
  model=model,
130
  tokenizer=tokenizer,