import json
import os

import torch
from datasets import Dataset, load_dataset
from transformers import (
    DataCollatorForSeq2Seq,
    T5ForConditionalGeneration,
    T5TokenizerFast,
    Trainer,
    TrainingArguments,
)
|
|
| |
# Load the project's fast T5 tokenizer from the "minicoderx-tokenizer"
# location (a local directory or hub id, as resolved by from_pretrained).
tokenizer_source = "minicoderx-tokenizer"
tokenizer = T5TokenizerFast.from_pretrained(tokenizer_source)
|
|
| |
def load_jsonl(path):
    """Read a JSON Lines file into a ``Dataset`` with two text columns.

    Args:
        path: Path to a UTF-8 text file holding one JSON object per line.
            Each object must provide ``"input"`` and ``"output"`` keys.

    Returns:
        A ``Dataset`` with the columns ``"input"`` and ``"output"``, in the
        same order as the records appear in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"input"`` or ``"output"`` key.
    """
    inputs = []
    outputs = []
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding when the data contains non-ASCII characters.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line); json.loads
            # would otherwise raise on them.
            if not line.strip():
                continue
            record = json.loads(line)
            inputs.append(record["input"])
            outputs.append(record["output"])
    return Dataset.from_dict({"input": inputs, "output": outputs})
|
|
# Build the training dataset from the JSONL file of input/output pairs.
train_path = "data/train.jsonl"
dataset = load_jsonl(train_path)
|
|
| |
def tokenize(batch):
    """Encode the ``"input"`` text with the module-level tokenizer.

    Args:
        batch: Mapping with an ``"input"`` entry holding the source text.

    Returns:
        The tokenizer's encoding, padded and truncated to exactly 128 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["input"], **encode_options)
|
|
def tokenize_labels(batch):
    """Encode the ``"output"`` text and store it under ``batch["labels"]``.

    The target text is padded/truncated to 128 tokens. Padding positions are
    then replaced with -100 so the cross-entropy loss ignores them; leaving
    the real pad token id in the labels would train the model to predict
    padding and dilute the loss.

    Args:
        batch: Mapping with an ``"output"`` entry holding either a single
            target string or a list of target strings.

    Returns:
        The same mapping with a ``"labels"`` entry added: a list of token ids
        for a single example, or a list of such lists for a batch.
    """
    labels = tokenizer(batch["output"], padding="max_length", truncation=True, max_length=128)
    pad_id = tokenizer.pad_token_id
    ignore_index = -100  # label value skipped by the loss

    def _mask(ids):
        return [ignore_index if tok == pad_id else tok for tok in ids]

    ids = labels["input_ids"]
    # A batched call yields a list of id lists; a single example yields one
    # flat list of ids.
    if ids and isinstance(ids[0], (list, tuple)):
        batch["labels"] = [_mask(seq) for seq in ids]
    else:
        batch["labels"] = _mask(ids)
    return batch
|
|
# Two passes over the dataset: the first adds the encoder inputs produced by
# tokenize, the second adds the "labels" column produced by tokenize_labels.
dataset = dataset.map(tokenize).map(tokenize_labels)
|
|
| |
# Start from the pretrained t5-small checkpoint.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# The tokenizer is loaded from a different source than the model. If it
# defines more tokens than the checkpoint's embedding table has rows, token
# ids would index out of range, so grow the embeddings to fit. The table is
# never shrunk, so the pretrained weights are left untouched otherwise.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))
|
|
| |
# Training configuration: checkpoints are written to "minicoderx-model" once
# per epoch, keeping at most the two most recent.
training_args = TrainingArguments(
    output_dir="minicoderx-model",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    # No evaluation strategy is passed: "no" is already the default, and the
    # `evaluation_strategy` keyword is deprecated in favour of `eval_strategy`
    # in newer transformers releases, so omitting it works on both.
    save_total_limit=2,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # the script fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
    overwrite_output_dir=True,
)
|
|
# Seq2seq collator that assembles training batches; it is given both the
# tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
|
|
# Wire the model, training configuration, tokenized dataset, collator and
# tokenizer into a single Trainer instance.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset,
    "data_collator": data_collator,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_kwargs)
|
|
| |
# Run the fine-tuning loop.
trainer.train()

# Persist the final model and the tokenizer side by side in one directory.
save_dir = "minicoderx-model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("Training complete and model saved.")