lucabadiali committed
Commit 52bb109 · 1 parent: d2882c3

Added local vs HF mode

Files changed (4)
  1. .gitignore +3 -3
  2. src/app/app.py +3 -7
  3. src/app/config.py +24 -0
  4. src/train_model.py +241 -0
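
The new mode switch is driven by the MODEL_SOURCE environment variable (see the src/app/config.py diff below). A minimal sketch of launching the API in each mode, assuming uvicorn is installed and the FastAPI app is importable as src.app.app:app (the exact module path is an assumption, not shown in this commit):

import os
import uvicorn

# "hf" (the default) pulls cardiffnlp/twitter-roberta-base-sentiment-latest from the Hub;
# "local" loads the artifacts written by src/train_model.py.
os.environ["MODEL_SOURCE"] = "hf"
uvicorn.run("src.app.app:app", host="127.0.0.1", port=8000)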
.gitignore CHANGED
@@ -1,6 +1,6 @@
 ProjectEnv
-saved_model
+models
 .pytest_cache
-artifacts
 data/__pycache__
-data/dataset
+data/dataset
+app/__pycache__
src/app/app.py CHANGED
@@ -9,7 +9,7 @@ import csv
 import requests
 from typing import Union, List
 import torch
-
+from .config import MODEL_SOURCE, ModelSource, load_model_and_tokenizer
 
 
 app = FastAPI()
@@ -18,17 +18,13 @@ app = FastAPI()
 class SentimentQuery(BaseModel):
     input_texts: Union[str, List[str]]
 
-task='sentiment'
-mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
+mapping_link = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
 with urllib.request.urlopen(mapping_link) as f:
     html = f.read().decode('utf-8').split("\n")
     csvreader = csv.reader(html, delimiter='\t')
 labels = [row[1] for row in csvreader if len(row) > 1]
 
-MODEL = f"cardiffnlp/twitter-roberta-base-{task}-latest"
-model = AutoModelForSequenceClassification.from_pretrained(MODEL)
-tokenizer = AutoTokenizer.from_pretrained(MODEL)
-
+tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
 
 
 @app.post("/predict")
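
For reference, a possible client call against the unchanged /predict handler, assuming the server is running on localhost:8000; the shape of the response depends on handler code not shown in this diff:

import requests

resp = requests.post(
    "http://127.0.0.1:8000/predict",
    json={"input_texts": ["I love this!", "This is awful."]},  # SentimentQuery accepts a str or a List[str]
)
print(resp.json())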
src/app/config.py ADDED
@@ -0,0 +1,24 @@
+import os
+from enum import Enum
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from pathlib import Path
+
+
+class ModelSource(str, Enum):
+    HF = "hf"
+    LOCAL = "local"
+
+MODEL_SOURCE = ModelSource(os.getenv("MODEL_SOURCE", "hf"))
+HF_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+
+
+def load_model_and_tokenizer(model_source: ModelSource):
+    if model_source == ModelSource.HF:  # use the latest model available on the HF Hub
+        tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
+        model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL)
+    else:  # use a locally fine-tuned model
+        local_model_path = Path("models/saved_model")
+        assert local_model_path.exists(), "No local model was found. Run 'python3 src/train_model.py'"
+        tokenizer = AutoTokenizer.from_pretrained("models/saved_tokenizer")
+        model = AutoModelForSequenceClassification.from_pretrained("models/saved_model")
+    return tokenizer, model
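
Because MODEL_SOURCE is resolved at import time via os.getenv, the environment variable must be set before this module is imported. A small sketch, assuming the package is importable as src.app.config (the real import path may differ):

import os

os.environ["MODEL_SOURCE"] = "local"  # must happen before the import below

from src.app.config import MODEL_SOURCE, load_model_and_tokenizer

tokenizer, model = load_model_and_tokenizer(MODEL_SOURCE)
print(type(model).__name__)  # e.g. RobertaForSequenceClassification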
src/train_model.py ADDED
@@ -0,0 +1,236 @@
+import csv
+import os
+import urllib.request
+
+import evaluate
+import torch
+from datasets import DatasetDict, load_from_disk
+from transformers import (
+    AutoTokenizer, AutoModelForSequenceClassification,
+    TrainingArguments, Trainer, EarlyStoppingCallback,
+    DataCollatorWithPadding
+)
+
+from app.utils import preprocess
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoid fork warnings from fast tokenizers
+
+
+# --- Device detection ---
+if torch.cuda.is_available():
+    device = "cuda"
+    use_bf16 = torch.cuda.is_bf16_supported()
+    use_fp16 = not use_bf16
+elif torch.backends.mps.is_available():
+    device = "mps"
+    use_bf16 = False
+    use_fp16 = False
+else:
+    device = "cpu"
+    use_bf16 = False
+    use_fp16 = False
+
+if device == "cuda" and use_bf16:
+    load_dtype = torch.bfloat16
+elif device == "cuda" and use_fp16:
+    load_dtype = torch.float16
+else:
+    load_dtype = torch.float32  # MPS/CPU -> fp32
+
+
+MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
+
+# download the TweetEval label mapping (negative/neutral/positive)
+mapping_link = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
+with urllib.request.urlopen(mapping_link) as f:
+    html = f.read().decode('utf-8').split("\n")
+    csvreader = csv.reader(html, delimiter='\t')
+labels = [row[1] for row in csvreader if len(row) > 1]
+
+
+# --- Tokenizer: keep a short max_length to save memory ---
+tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, model_max_length=128)
+
+
+def tokenize_function(batch):
+    return tokenizer(
+        batch["text"],
+        truncation=True,
+        max_length=128,
+        padding=False  # we pad per-batch via DataCollatorWithPadding
+    )
+
+
+data_collator = DataCollatorWithPadding(
+    tokenizer=tokenizer,
+    pad_to_multiple_of=8 if (device == "cuda" and (use_bf16 or use_fp16)) else None
+)
+
+
+model = AutoModelForSequenceClassification.from_pretrained(
+    MODEL, num_labels=3, torch_dtype=load_dtype
+)
+
+model.gradient_checkpointing_enable()
+model.config.use_cache = False
+
+
+# --- Dataset loading ---
+dataset_path = "data/dataset"
+dataset = load_from_disk(dataset_path)
+
+
+def make_trainer_ready(
+    raw_ds: DatasetDict,
+    model_name: str = "cardiffnlp/twitter-roberta-base-sep2022",
+    train_frac: float = 0.2,
+    val_frac: float = 0.2,
+    seed: int = 42,
+    label_col: str = "label",
+    text_col: str = "text",
+    max_length: int = 128,
+    pad_to_multiple_of_8_on_cuda: bool = True,
+):
+    """
+    Returns (train_ds, eval_ds, data_collator, tokenizer) ready for the HF Trainer.
+    - Ensures there is a validation split (creates one from train if missing).
+    - Takes fractional subsets, stratified by label when possible.
+    - Tokenizes and keeps only the columns the Trainer expects.
+    """
+    assert 0 < train_frac <= 1.0, "train_frac must be in (0,1]."
+    assert 0 < val_frac <= 1.0, "val_frac must be in (0,1]."
+    assert text_col in raw_ds["train"].column_names, f"Missing text column: {text_col}"
+    assert label_col in raw_ds["train"].column_names, f"Missing label column: {label_col}"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, model_max_length=max_length)
+
+    # 1) Ensure we have a validation split
+    if "validation" not in raw_ds:
+        split = raw_ds["train"].train_test_split(
+            test_size=val_frac,
+            stratify_by_column=label_col if label_col in raw_ds["train"].column_names else None,
+            seed=seed,
+        )
+        raw_ds = DatasetDict(train=split["train"], validation=split["test"])
+    else:
+        raw_ds = DatasetDict(train=raw_ds["train"], validation=raw_ds["validation"])
+
+    # 2) Take fractions (stratified when possible)
+    def take_frac(ds, frac):
+        if frac >= 1.0:  # keep the full split
+            return ds
+        out = ds.train_test_split(
+            test_size=1 - frac,
+            stratify_by_column=label_col if label_col in ds.column_names else None,
+            seed=seed,
+        )
+        return out["train"]  # the kept fraction
+
+    small_train = take_frac(raw_ds["train"], train_frac)
+    small_eval = take_frac(raw_ds["validation"], val_frac)
+
+    # 3) Tokenize (no padding here; we pad per-batch with the collator)
+    def tok(batch):
+        return tokenizer(batch[text_col], truncation=True, max_length=max_length, padding=False)
+
+    small_train_tok = small_train.map(tok, batched=True, remove_columns=[c for c in small_train.column_names if c not in (text_col, label_col)])
+    small_eval_tok = small_eval.map(tok, batched=True, remove_columns=[c for c in small_eval.column_names if c not in (text_col, label_col)])
+
+    # 4) Keep only the columns the Trainer needs
+    keep_cols = ["input_ids", "attention_mask", label_col]
+    small_train_tok = small_train_tok.remove_columns([c for c in small_train_tok.column_names if c not in keep_cols])
+    small_eval_tok = small_eval_tok.remove_columns([c for c in small_eval_tok.column_names if c not in keep_cols])
+
+    # 5) Data collator with dynamic padding (CUDA gets pad_to_multiple_of=8)
+    pad_to_mult = 8 if (pad_to_multiple_of_8_on_cuda and torch.cuda.is_available()) else None
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=pad_to_mult)
+
+    return small_train_tok, small_eval_tok, data_collator, tokenizer
+
+
+train_ds, eval_ds, data_collator, tokenizer = make_trainer_ready(
+    raw_ds=dataset,
+    model_name="cardiffnlp/twitter-roberta-base-sep2022",
+    train_frac=0.2,  # take 20% of train
+    val_frac=0.5,    # take 50% of validation
+    seed=42,
+    label_col="label",
+    text_col="text",
+    max_length=128,
+)
+
+# --- Training args: avoid forking on macOS, fix pin_memory ---
+trainer_fp16 = bool(device == "cuda" and use_fp16)
+trainer_bf16 = bool(device == "cuda" and use_bf16)
+
+training_args = TrainingArguments(
+    output_dir="models/artifacts",
+    learning_rate=1e-5,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=8,
+    gradient_accumulation_steps=8,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    warmup_ratio=0.1,
+    lr_scheduler_type="linear",
+
+    eval_strategy="steps",
+    logging_strategy="steps",
+    save_strategy="steps",
+    eval_steps=500,
+    logging_steps=100,
+    save_steps=500,
+
+    load_best_model_at_end=True,
+    metric_for_best_model="recall",
+    greater_is_better=True,
+    save_total_limit=2,
+
+    # Precision
+    fp16=trainer_fp16,
+    bf16=trainer_bf16,
+
+    # DataLoader knobs (avoid the fork/tokenizers warning on macOS)
+    dataloader_num_workers=0,  # <- key for macOS/MPS
+    dataloader_pin_memory=(device == "cuda"),  # False on MPS/CPU, True on CUDA
+    group_by_length=True,
+    report_to="none",
+)
+
+# --- Metrics (macro recall, etc.) ---
+recall_metric = evaluate.load("recall")
+acc_metric = evaluate.load("accuracy")
+f1_metric = evaluate.load("f1")
+
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    preds = logits.argmax(axis=-1)
+    return {
+        "accuracy": acc_metric.compute(predictions=preds, references=labels)["accuracy"],
+        "f1_macro": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"],
+        "recall": recall_metric.compute(predictions=preds, references=labels, average="macro")["recall"],
+    }
+
+
+callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_ds,
+    eval_dataset=eval_ds,
+    compute_metrics=compute_metrics,
+    data_collator=data_collator,  # <- important
+    tokenizer=tokenizer,
+    callbacks=callbacks,
+)
+
+model.to(device)
+trainer.train()
+trainer.save_model("models/saved_model")
+tokenizer.save_pretrained("models/saved_tokenizer")
+try:
+    trainer.create_model_card()
+except Exception:
+    pass
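
A hedged smoke test for the artifacts this script writes: load them the same way config.py's local mode does and run a single forward pass. The class order follows the TweetEval mapping downloaded above (negative/neutral/positive):

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("models/saved_tokenizer")
model = AutoModelForSequenceClassification.from_pretrained("models/saved_model")
model.eval()

enc = tokenizer("great game today!", return_tensors="pt")
with torch.no_grad():
    probs = model(**enc).logits.softmax(dim=-1)
print(probs)  # probabilities over the 3 sentiment classes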