mjbommar committed · verified
Commit 9d1a15a · 1 Parent(s): 55d677f

Upload magic-bert-50m-roformer-classification model files

README.md CHANGED
@@ -220,46 +220,15 @@ The model classifies files into 106 MIME types across these categories:
 ## How to Use
 
 ```python
-from transformers import RoFormerModel, AutoTokenizer
-from safetensors.torch import load_file
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import json
 
-# Load tokenizer and MIME mapping
-tokenizer = AutoTokenizer.from_pretrained("path/to/magic-bert-50m-roformer-classification")
-with open("path/to/magic-bert-50m-roformer-classification/mime_type_mapping.json") as f:
-    mime_mapping = json.load(f)
-id_to_mime = {int(k): v for k, v in mime_mapping.items()}
-
-# Load base model
-base_model = RoFormerModel.from_pretrained("path/to/magic-bert-50m-roformer-classification")
-
-# Create classification head
-class ClassificationHead(nn.Module):
-    def __init__(self, hidden_size=512, projection_dim=256, num_classes=106):
-        super().__init__()
-        self.projection = nn.Sequential(
-            nn.Linear(hidden_size, hidden_size),
-            nn.ReLU(),
-            nn.Linear(hidden_size, projection_dim),
-        )
-        self.classifier = nn.Linear(projection_dim, num_classes)
-
-    def forward(self, hidden_states):
-        pooled = hidden_states[:, 0, :]  # CLS token
-        projected = self.projection(pooled)
-        projected = F.normalize(projected, p=2, dim=1)
-        return self.classifier(projected), projected
-
-head = ClassificationHead()
-contrastive_dict = load_file("path/to/magic-bert-50m-roformer-classification/contrastive_head.safetensors")
-head.projection.load_state_dict({k.replace("projection.", ""): v for k, v in contrastive_dict.items() if "projection" in k})
-head.classifier.load_state_dict({k.replace("classifier.", ""): v for k, v in contrastive_dict.items() if "classifier" in k})
-
-base_model.eval()
-head.eval()
+model = AutoModelForSequenceClassification.from_pretrained(
+    "mjbommar/magic-bert-50m-roformer-classification", trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained("mjbommar/magic-bert-50m-roformer-classification")
+
+model.eval()
 
 # Classify a file
 with open("example.pdf", "rb") as f:
@@ -267,32 +236,40 @@ with open("example.pdf", "rb") as f:
 
 # Decode bytes to string using latin-1 (preserves all byte values 0-255)
 text = data.decode("latin-1")
-
 inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
 
 with torch.no_grad():
-    outputs = base_model(**inputs)
-    logits, embeddings = head(outputs.last_hidden_state)
-    predicted_id = logits.argmax(-1).item()
+    outputs = model(**inputs)
+    predicted_id = outputs.logits.argmax(-1).item()
+    confidence = torch.softmax(outputs.logits, dim=-1).max().item()
 
-print(f"Predicted MIME type: {id_to_mime[predicted_id]}")
-print(f"Confidence: {F.softmax(logits, dim=-1).max().item():.2%}")
+print(f"Predicted class: {predicted_id}")
+print(f"Confidence: {confidence:.2%}")
 ```
 
 ### Embedding-Based Similarity Search
 
 ```python
-# Get normalized embeddings for similarity search
+# Get normalized embeddings (256-dim, L2-normalized)
 with torch.no_grad():
-    outputs = base_model(**inputs)
-    _, embeddings = head(outputs.last_hidden_state)
-    # embeddings shape: [batch_size, 256], L2 normalized
+    embeddings = model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])
+    # embeddings shape: [batch_size, 256]
 
 # Compute cosine similarity
 similarity = torch.mm(embeddings1, embeddings2.T)
+```
+
+### Loading MIME Type Labels
 
-# Find most similar files
-top_k = similarity[0].topk(5)
+```python
+from huggingface_hub import hf_hub_download
+import json
+
+mime_path = hf_hub_download("mjbommar/magic-bert-50m-roformer-classification", "mime_type_mapping.json")
+with open(mime_path) as f:
+    id_to_mime = {int(k): v for k, v in json.load(f).items()}
+
+print(f"Predicted MIME type: {id_to_mime[predicted_id]}")
 ```
 
 ## Limitations
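
The updated similarity example above leaves `embeddings1` and `embeddings2` as placeholders. A minimal end-to-end sketch of how they could be produced with the new API (file names and the 8 KB read size are illustrative; it assumes `model` and `tokenizer` are loaded with `trust_remote_code=True` as in the updated README):

```python
import torch

def embed_file(path: str) -> torch.Tensor:
    """Return the L2-normalized [1, 256] embedding for one file."""
    with open(path, "rb") as f:
        data = f.read(8192)  # illustrative read size; the exact value is not shown in this diff
    text = data.decode("latin-1")  # latin-1 preserves all byte values 0-255
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        return model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])

embeddings1 = embed_file("a.pdf")  # placeholder paths
embeddings2 = embed_file("b.png")

# Embeddings are L2-normalized, so the dot product is cosine similarity.
similarity = torch.mm(embeddings1, embeddings2.T)
print(f"Cosine similarity: {similarity.item():.3f}")
```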
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "RoFormerForSequenceClassification"
+    "RoFormerForSequenceClassificationWithProjection"
   ],
   "attention_probs_dropout_prob": 0.1,
   "embedding_size": 512,
@@ -21,5 +21,10 @@
   "use_cache": true,
   "vocab_size": 32768,
   "num_labels": 106,
-  "problem_type": "single_label_classification"
+  "problem_type": "single_label_classification",
+  "projection_dim": 256,
+  "auto_map": {
+    "AutoConfig": "configuration_roformer_classification.RoFormerClassificationConfig",
+    "AutoModelForSequenceClassification": "modeling_roformer_classification.RoFormerForSequenceClassificationWithProjection"
+  }
 }
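
The new `auto_map` block is what lets the `AutoConfig` and `AutoModelForSequenceClassification` entry points resolve the custom classes shipped in this repository when `trust_remote_code=True` is passed. A short sketch of how that wiring is exercised (the printed values mirror this config):

```python
from transformers import AutoConfig, AutoModelForSequenceClassification

repo = "mjbommar/magic-bert-50m-roformer-classification"

# auto_map routes AutoConfig to the custom config class ...
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)                     # RoFormerClassificationConfig
print(config.projection_dim, config.num_labels)  # 256 106

# ... and AutoModelForSequenceClassification to the projection-head model.
model = AutoModelForSequenceClassification.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)  # RoFormerForSequenceClassificationWithProjection
```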
configuration_roformer_classification.py ADDED
@@ -0,0 +1,30 @@
+"""RoFormer configuration for classification with projection head."""
+
+from transformers import RoFormerConfig
+
+
+class RoFormerClassificationConfig(RoFormerConfig):
+    """Configuration for RoFormer with contrastive projection head.
+
+    Extends RoFormerConfig with additional parameters for the projection head
+    used in contrastive learning for file type classification.
+    """
+
+    model_type = "roformer-classification"
+
+    def __init__(
+        self,
+        projection_dim: int = 256,
+        num_labels: int = 106,
+        **kwargs,
+    ):
+        """Initialize configuration.
+
+        Args:
+            projection_dim: Dimension of the projection head output (for embeddings)
+            num_labels: Number of classification labels (MIME types)
+            **kwargs: Additional arguments passed to RoFormerConfig
+        """
+        super().__init__(**kwargs)
+        self.projection_dim = projection_dim
+        self.num_labels = num_labels
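
For completeness, the config class can also be constructed directly; a small sketch, assuming the file above is available locally on the Python path (extra keyword arguments fall through to the base `RoFormerConfig`):

```python
from configuration_roformer_classification import RoFormerClassificationConfig

config = RoFormerClassificationConfig(
    projection_dim=256,   # matches config.json above
    num_labels=106,
    embedding_size=512,   # base RoFormerConfig kwargs pass through **kwargs
    vocab_size=32768,
)
print(config.model_type)      # "roformer-classification"
print(config.projection_dim)  # 256
```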
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2a3c0be25fef5e6e5da5c470a8feec34d20ad3a7467bdb3fb742fd521310b639
-size 169324736
+oid sha256:2909ecbd713ce578b9124b0bb09cd7ead8547cea2e3ccece2b25cbe528b1a3d1
+size 169932352
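
The LFS pointer records only the SHA-256 digest and byte size of the new weights; if desired, a downloaded copy can be checked against those values. A small sketch (the repository id mirrors the README example):

```python
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download("mjbommar/magic-bert-50m-roformer-classification", "model.safetensors")

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

print(digest.hexdigest() == "2909ecbd713ce578b9124b0bb09cd7ead8547cea2e3ccece2b25cbe528b1a3d1")
```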
modeling_roformer_classification.py ADDED
@@ -0,0 +1,147 @@
+"""RoFormer model with projection head for classification.
+
+This module provides a RoFormer-based model with a projection head for
+contrastive learning, enabling both classification and embedding-based
+similarity search for file type detection.
+"""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import RoFormerModel, RoFormerPreTrainedModel
+from transformers.modeling_outputs import SequenceClassifierOutput
+
+try:
+    from .configuration_roformer_classification import RoFormerClassificationConfig
+except ImportError:
+    from configuration_roformer_classification import RoFormerClassificationConfig
+
+
+class RoFormerForSequenceClassificationWithProjection(RoFormerPreTrainedModel):
+    """RoFormer with projection head for file type classification.
+
+    This model extends RoFormer with a projection head that produces
+    L2-normalized embeddings suitable for both classification and
+    similarity search. The architecture is:
+
+        RoFormer -> CLS pooling -> Classifier (logits) and Projection -> L2 Norm (embeddings)
+
+    The projection head enables contrastive learning and produces
+    embeddings for similarity-based file type matching.
+    """
+
+    config_class = RoFormerClassificationConfig
+
+    def __init__(self, config: RoFormerClassificationConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.projection_dim = getattr(config, "projection_dim", 256)
+
+        self.roformer = RoFormerModel(config)
+
+        # Projection head for contrastive learning embeddings
+        self.projection = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.ReLU(),
+            nn.Linear(config.hidden_size, self.projection_dim),
+        )
+
+        # Classifier on pooled output (hidden_size, not projection_dim)
+        # This architecture uses hidden representation for classification
+        # while projection is for embedding similarity search
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutput]:
+        """Forward pass for classification.
+
+        Args:
+            input_ids: Input token IDs [batch_size, seq_length]
+            attention_mask: Attention mask [batch_size, seq_length]
+            token_type_ids: Token type IDs (optional)
+            head_mask: Head mask for attention (optional)
+            inputs_embeds: Input embeddings (optional, alternative to input_ids)
+            labels: Labels for computing loss [batch_size]
+            output_attentions: Whether to return attention weights
+            output_hidden_states: Whether to return hidden states
+            return_dict: Whether to return a SequenceClassifierOutput
+
+        Returns:
+            SequenceClassifierOutput with loss, logits, and optional hidden states
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Pool using CLS token
+        sequence_output = outputs[0]
+        pooled_output = sequence_output[:, 0, :]
+
+        # Classify from pooled output directly
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def get_embeddings(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Get normalized projection embeddings for similarity search.
+
+        Args:
+            input_ids: Input token IDs [batch_size, seq_length]
+            attention_mask: Attention mask [batch_size, seq_length]
+            token_type_ids: Token type IDs (optional)
+
+        Returns:
+            L2-normalized embeddings [batch_size, projection_dim]
+        """
+        outputs = self.roformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            return_dict=True,
+        )
+
+        pooled_output = outputs.last_hidden_state[:, 0, :]
+        projections = self.projection(pooled_output)
+        return F.normalize(projections, p=2, dim=1)
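
To make the two heads concrete: `forward` classifies from the CLS pooled hidden state, while `get_embeddings` returns the L2-normalized 256-dim projections. A minimal usage sketch (assuming the repo is loaded with `trust_remote_code=True` as in the README; the input string stands in for decoded file bytes):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo = "mjbommar/magic-bert-50m-roformer-classification"
model = AutoModelForSequenceClassification.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo)
model.eval()

inputs = tokenizer("%PDF-1.7 ...", return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    logits = model(**inputs).logits  # [1, 106] class scores from the pooled CLS output
    embeddings = model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])  # [1, 256]

print(logits.shape, embeddings.shape)
print(embeddings.norm(dim=1))  # ~1.0, since the projections are L2-normalized
```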