Spaces:

AB498
/

codebert-base-mlm

Sleeping

8a48ded about 1 month ago

4 kB

	import gradio as gr
	from transformers import RobertaTokenizer, RobertaForMaskedLM
	import torch

	# Load CodeBERT model and tokenizer
	model_name = "microsoft/codebert-base-mlm"
	tokenizer = RobertaTokenizer.from_pretrained(model_name)
	model = RobertaForMaskedLM.from_pretrained(model_name)

	def predict(code, num_predictions=5):
	"""
	Predict the masked token in code.
	Use <mask> to indicate where to predict.

	Args:
	code: Code snippet with <mask> token
	num_predictions: Number of top predictions to return

	Returns:
	JSON object with predictions
	"""
	try:
	# Replace <mask> with the tokenizer's mask token
	code_input = code.replace("<mask>", tokenizer.mask_token)

	# Tokenize input
	inputs = tokenizer(code_input, return_tensors="pt")

	# Find the position of the mask token
	mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

	if len(mask_token_index) == 0:
	return {
	"error": "No <mask> token found in the input. Please include <mask> where you want predictions.",
	"predictions": []
	}

	# Get predictions
	with torch.no_grad():
	outputs = model(**inputs)
	logits = outputs.logits

	# Get top-k predictions for the mask token
	mask_token_logits = logits[0, mask_token_index, :]
	top_tokens = torch.topk(mask_token_logits, num_predictions, dim=1)

	predictions = []
	for rank, (token_id, score) in enumerate(zip(top_tokens.indices[0].tolist(), top_tokens.values[0].tolist()), 1):
	predicted_token = tokenizer.decode([token_id])
	completed_code = code_input.replace(tokenizer.mask_token, predicted_token)

	predictions.append({
	"rank": rank,
	"token": predicted_token,
	"score": round(float(score), 4),
	"completed_code": completed_code
	})

	return {
	"original_code": code,
	"predictions": predictions
	}

	except Exception as e:
	return {
	"error": str(e),
	"predictions": []
	}

	# Create Gradio interface
	with gr.Blocks(title="CodeBERT Masked Language Model") as demo:
	gr.Markdown(
	"""
	# CodeBERT Masked Language Model

	This model predicts masked tokens in code. Use `<mask>` to indicate where you want predictions.

	### Examples:
	- `def <mask>(x, y): return x + y`
	- `import <mask>`
	- `for i in <mask>(10):`
	- `x = [1, 2, 3]; y = x.<mask>()`
	"""
	)

	with gr.Row():
	with gr.Column():
	code_input = gr.Textbox(
	label="Code with <mask>",
	placeholder="Enter code with <mask> token...",
	lines=5,
	value="def <mask>(x, y):\n return x + y"
	)
	num_predictions_slider = gr.Slider(
	minimum=1,
	maximum=10,
	value=5,
	step=1,
	label="Number of predictions"
	)
	predict_btn = gr.Button("Predict", variant="primary")

	with gr.Column():
	output = gr.JSON(
	label="Predictions"
	)

	# Examples
	gr.Examples(
	examples=[
	["def <mask>(x, y):\n return x + y", 5],
	["import <mask>", 5],
	["for i in <mask>(10):", 5],
	["x = [1, 2, 3]\ny = x.<mask>()", 5],
	["if x <mask> 0:", 5],
	["class <mask>:", 5],
	],
	inputs=[code_input, num_predictions_slider],
	)

	predict_btn.click(
	fn=predict,
	inputs=[code_input, num_predictions_slider],
	outputs=output
	)

	if __name__ == "__main__":
	demo.launch()