mjbommar committed · verified
Commit 9d1a15a · 1 Parent(s): 55d677f

Upload magic-bert-50m-roformer-classification model files

README.md CHANGED
@@ -220,46 +220,15 @@ The model classifies files into 106 MIME types across these categories:
 ## How to Use
 
 ```python
-from transformers import RoFormerModel, AutoTokenizer
-from safetensors.torch import load_file
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import json
 
-# Load tokenizer and MIME mapping
-tokenizer = AutoTokenizer.from_pretrained("path/to/magic-bert-50m-roformer-classification")
-with open("path/to/magic-bert-50m-roformer-classification/mime_type_mapping.json") as f:
-    mime_mapping = json.load(f)
-id_to_mime = {int(k): v for k, v in mime_mapping.items()}
-
-# Load base model
-base_model = RoFormerModel.from_pretrained("path/to/magic-bert-50m-roformer-classification")
-
-# Create classification head
-class ClassificationHead(nn.Module):
-    def __init__(self, hidden_size=512, projection_dim=256, num_classes=106):
-        super().__init__()
-        self.projection = nn.Sequential(
-            nn.Linear(hidden_size, hidden_size),
-            nn.ReLU(),
-            nn.Linear(hidden_size, projection_dim),
-        )
-        self.classifier = nn.Linear(projection_dim, num_classes)
-
-    def forward(self, hidden_states):
-        pooled = hidden_states[:, 0, :]  # CLS token
-        projected = self.projection(pooled)
-        projected = F.normalize(projected, p=2, dim=1)
-        return self.classifier(projected), projected
-
-head = ClassificationHead()
-contrastive_dict = load_file("path/to/magic-bert-50m-roformer-classification/contrastive_head.safetensors")
-head.projection.load_state_dict({k.replace("projection.", ""): v for k, v in contrastive_dict.items() if "projection" in k})
-head.classifier.load_state_dict({k.replace("classifier.", ""): v for k, v in contrastive_dict.items() if "classifier" in k})
-
-base_model.eval()
-head.eval()
+model = AutoModelForSequenceClassification.from_pretrained(
+    "mjbommar/magic-bert-50m-roformer-classification", trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained("mjbommar/magic-bert-50m-roformer-classification")
+
+model.eval()
 
 # Classify a file
 with open("example.pdf", "rb") as f:
@@ -267,32 +236,40 @@ with open("example.pdf", "rb") as f:
 
 # Decode bytes to string using latin-1 (preserves all byte values 0-255)
 text = data.decode("latin-1")
-
 inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
 
 with torch.no_grad():
-    outputs = base_model(**inputs)
-    logits, embeddings = head(outputs.last_hidden_state)
-    predicted_id = logits.argmax(-1).item()
+    outputs = model(**inputs)
+    predicted_id = outputs.logits.argmax(-1).item()
+    confidence = torch.softmax(outputs.logits, dim=-1).max().item()
 
-print(f"Predicted MIME type: {id_to_mime[predicted_id]}")
-print(f"Confidence: {F.softmax(logits, dim=-1).max().item():.2%}")
+print(f"Predicted class: {predicted_id}")
+print(f"Confidence: {confidence:.2%}")
 ```
 
 ### Embedding-Based Similarity Search
 
 ```python
-# Get normalized embeddings for similarity search
+# Get normalized embeddings (256-dim, L2-normalized)
 with torch.no_grad():
-    outputs = base_model(**inputs)
-    _, embeddings = head(outputs.last_hidden_state)
-    # embeddings shape: [batch_size, 256], L2 normalized
+    embeddings = model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])
+    # embeddings shape: [batch_size, 256]
 
 # Compute cosine similarity
 similarity = torch.mm(embeddings1, embeddings2.T)
+```
+
+### Loading MIME Type Labels
 
-# Find most similar files
-top_k = similarity[0].topk(5)
+```python
+from huggingface_hub import hf_hub_download
+import json
+
+mime_path = hf_hub_download("mjbommar/magic-bert-50m-roformer-classification", "mime_type_mapping.json")
+with open(mime_path) as f:
+    id_to_mime = {int(k): v for k, v in json.load(f).items()}
+
+print(f"Predicted MIME type: {id_to_mime[predicted_id]}")
 ```
 
 ## Limitations
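
The updated similarity example above leaves `embeddings1` and `embeddings2` as placeholders. A minimal end-to-end sketch of how they could be produced with the new API (file names and the 8 KB read size are illustrative; it assumes `model` and `tokenizer` are loaded with `trust_remote_code=True` as in the updated README):

```python
import torch

def embed_file(path: str) -> torch.Tensor:
    """Return the L2-normalized [1, 256] embedding for one file."""
    with open(path, "rb") as f:
        data = f.read(8192)  # illustrative read size; the exact value is not shown in this diff
    text = data.decode("latin-1")  # latin-1 preserves all byte values 0-255
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        return model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])

embeddings1 = embed_file("a.pdf")  # placeholder paths
embeddings2 = embed_file("b.png")

# Embeddings are L2-normalized, so the dot product is cosine similarity.
similarity = torch.mm(embeddings1, embeddings2.T)
print(f"Cosine similarity: {similarity.item():.3f}")
```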
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "RoFormerForSequenceClassification"
+    "RoFormerForSequenceClassificationWithProjection"
   ],
   "attention_probs_dropout_prob": 0.1,
   "embedding_size": 512,
@@ -21,5 +21,10 @@
   "use_cache": true,
   "vocab_size": 32768,
   "num_labels": 106,
-  "problem_type": "single_label_classification"
+  "problem_type": "single_label_classification",
+  "projection_dim": 256,
+  "auto_map": {
+    "AutoConfig": "configuration_roformer_classification.RoFormerClassificationConfig",
+    "AutoModelForSequenceClassification": "modeling_roformer_classification.RoFormerForSequenceClassificationWithProjection"
+  }
 }
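
The new `auto_map` block is what lets the `AutoConfig` and `AutoModelForSequenceClassification` entry points resolve the custom classes shipped in this repository when `trust_remote_code=True` is passed. A short sketch of how that wiring is exercised (the printed values mirror this config):

```python
from transformers import AutoConfig, AutoModelForSequenceClassification

repo = "mjbommar/magic-bert-50m-roformer-classification"

# auto_map routes AutoConfig to the custom config class ...
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)                     # RoFormerClassificationConfig
print(config.projection_dim, config.num_labels)  # 256 106

# ... and AutoModelForSequenceClassification to the projection-head model.
model = AutoModelForSequenceClassification.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)  # RoFormerForSequenceClassificationWithProjection
```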
configuration_roformer_classification.py ADDED
@@ -0,0 +1,30 @@
+"""RoFormer configuration for classification with projection head."""
+
+from transformers import RoFormerConfig
+
+
+class RoFormerClassificationConfig(RoFormerConfig):
+    """Configuration for RoFormer with contrastive projection head.
+
+    Extends RoFormerConfig with additional parameters for the projection head
+    used in contrastive learning for file type classification.
+    """
+
+    model_type = "roformer-classification"
+
+    def __init__(
+        self,
+        projection_dim: int = 256,
+        num_labels: int = 106,
+        **kwargs,
+    ):
+        """Initialize configuration.
+
+        Args:
+            projection_dim: Dimension of the projection head output (for embeddings)
+            num_labels: Number of classification labels (MIME types)
+            **kwargs: Additional arguments passed to RoFormerConfig
+        """
+        super().__init__(**kwargs)
+        self.projection_dim = projection_dim
+        self.num_labels = num_labels
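
For completeness, the config class can also be constructed directly; a small sketch, assuming the file above is available locally on the Python path (extra keyword arguments fall through to the base `RoFormerConfig`):

```python
from configuration_roformer_classification import RoFormerClassificationConfig

config = RoFormerClassificationConfig(
    projection_dim=256,   # matches config.json above
    num_labels=106,
    embedding_size=512,   # base RoFormerConfig kwargs pass through **kwargs
    vocab_size=32768,
)
print(config.model_type)      # "roformer-classification"
print(config.projection_dim)  # 256
```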
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2a3c0be25fef5e6e5da5c470a8feec34d20ad3a7467bdb3fb742fd521310b639
-size 169324736
+oid sha256:2909ecbd713ce578b9124b0bb09cd7ead8547cea2e3ccece2b25cbe528b1a3d1
+size 169932352
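
The LFS pointer records only the SHA-256 digest and byte size of the new weights; if desired, a downloaded copy can be checked against those values. A small sketch (the repository id mirrors the README example):

```python
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download("mjbommar/magic-bert-50m-roformer-classification", "model.safetensors")

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

print(digest.hexdigest() == "2909ecbd713ce578b9124b0bb09cd7ead8547cea2e3ccece2b25cbe528b1a3d1")
```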
modeling_roformer_classification.py ADDED
@@ -0,0 +1,147 @@
+"""RoFormer model with projection head for classification.
+
+This module provides a RoFormer-based model with a projection head for
+contrastive learning, enabling both classification and embedding-based
+similarity search for file type detection.
+"""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import RoFormerModel, RoFormerPreTrainedModel
+from transformers.modeling_outputs import SequenceClassifierOutput
+
+try:
+    from .configuration_roformer_classification import RoFormerClassificationConfig
+except ImportError:
+    from configuration_roformer_classification import RoFormerClassificationConfig
+
+
+class RoFormerForSequenceClassificationWithProjection(RoFormerPreTrainedModel):
+    """RoFormer with projection head for file type classification.
+
+    This model extends RoFormer with a projection head that produces
+    L2-normalized embeddings suitable for both classification and
+    similarity search. The architecture is:
+
+        RoFormer -> CLS pooling -> Classifier (logits) and Projection -> L2 Norm (embeddings)
+
+    The projection head enables contrastive learning and produces
+    embeddings for similarity-based file type matching.
+    """
+
+    config_class = RoFormerClassificationConfig
+
+    def __init__(self, config: RoFormerClassificationConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.projection_dim = getattr(config, "projection_dim", 256)
+
+        self.roformer = RoFormerModel(config)
+
+        # Projection head for contrastive learning embeddings
+        self.projection = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.ReLU(),
+            nn.Linear(config.hidden_size, self.projection_dim),
+        )
+
+        # Classifier on pooled output (hidden_size, not projection_dim)
+        # This architecture uses hidden representation for classification
+        # while projection is for embedding similarity search
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutput]:
+        """Forward pass for classification.
+
+        Args:
+            input_ids: Input token IDs [batch_size, seq_length]
+            attention_mask: Attention mask [batch_size, seq_length]
+            token_type_ids: Token type IDs (optional)
+            head_mask: Head mask for attention (optional)
+            inputs_embeds: Input embeddings (optional, alternative to input_ids)
+            labels: Labels for computing loss [batch_size]
+            output_attentions: Whether to return attention weights
+            output_hidden_states: Whether to return hidden states
+            return_dict: Whether to return a SequenceClassifierOutput
+
+        Returns:
+            SequenceClassifierOutput with loss, logits, and optional hidden states
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Pool using CLS token
+        sequence_output = outputs[0]
+        pooled_output = sequence_output[:, 0, :]
+
+        # Classify from pooled output directly
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def get_embeddings(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Get normalized projection embeddings for similarity search.
+
+        Args:
+            input_ids: Input token IDs [batch_size, seq_length]
+            attention_mask: Attention mask [batch_size, seq_length]
+            token_type_ids: Token type IDs (optional)
+
+        Returns:
+            L2-normalized embeddings [batch_size, projection_dim]
+        """
+        outputs = self.roformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            return_dict=True,
+        )
+
+        pooled_output = outputs.last_hidden_state[:, 0, :]
+        projections = self.projection(pooled_output)
+        return F.normalize(projections, p=2, dim=1)
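
To make the two heads concrete: `forward` classifies from the CLS pooled hidden state, while `get_embeddings` returns the L2-normalized 256-dim projections. A minimal usage sketch (assuming the repo is loaded with `trust_remote_code=True` as in the README; the input string stands in for decoded file bytes):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo = "mjbommar/magic-bert-50m-roformer-classification"
model = AutoModelForSequenceClassification.from_pretrained(repo, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo)
model.eval()

inputs = tokenizer("%PDF-1.7 ...", return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    logits = model(**inputs).logits  # [1, 106] class scores from the pooled CLS output
    embeddings = model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])  # [1, 256]

print(logits.shape, embeddings.shape)
print(embeddings.norm(dim=1))  # ~1.0, since the projections are L2-normalized
```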