NewsBERT is a domain-adapted masked language model based on google-bert/bert-base-uncased. It has been fine-tuned with a masked language modeling (MLM) objective on historical English newspaper text drawn from two collections.
NewsBERT retains the architecture and vocabulary of BERT-base (uncased); only the weights have been adapted to these datasets.
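Because the tokenizer and architecture are inherited unchanged, this can be checked directly against the base checkpoint. A minimal sketch (it assumes both checkpoints are reachable on the Hugging Face Hub; the printed values are what a BERT-base configuration should report):
from transformers import AutoConfig, AutoTokenizer
news_tok = AutoTokenizer.from_pretrained("npedrazzini/NewsBERT")
base_tok = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
print(news_tok.vocab_size == base_tok.vocab_size)  # should print True (30522 WordPiece tokens)
news_cfg = AutoConfig.from_pretrained("npedrazzini/NewsBERT")
print(news_cfg.hidden_size, news_cfg.num_hidden_layers)  # 768 12 for a BERT-base architecture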
Masked word prediction with the fill-mask pipeline:
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
model_id = "npedrazzini/NewsBERT"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
text = "The [MASK] was published in the newspaper."
preds = fill_mask(text)
for p in preds:
    print(f"{p['sequence']} (score={p['score']:.4f})")
NewsBERT can also be used as a feature extractor, producing one embedding per text from the [CLS] token:
import torch
from transformers import AutoTokenizer, AutoModel
model_id = "npedrazzini/NewsBERT"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()
def encode(text, max_length=512):
    # Return a single [CLS] embedding for a text (shape [768]).
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :]  # CLS token
    return embedding.squeeze(0).cpu()  # [768]
embedding = encode("Example newspaper article text...")
print(embedding.shape) # torch.Size([768])
import torch.nn.functional as F
e1 = encode("Article text one...")
e2 = encode("Another article...")
cos_sim = F.cosine_similarity(e1, e2, dim=0)
print("Cosine similarity:", cos_sim.item())