File size: 3,551 Bytes
a1bbbd5
 
 
 
 
 
 
 
 
 
 
 
 
0420d4e
a1bbbd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# app.py
import streamlit as st
import joblib, json, numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Location of all serialized training artifacts.
ART = Path("artifacts")

# FIX: `json.load(open(...))` leaked the file handle; use a context manager.
with open(ART / "label_names.json", encoding="utf-8") as _f:
    LABELS = json.load(_f)

# Name of the sentence-embedding model used at training time (recorded by
# the training pipeline so inference loads the matching encoder).
EMB_MODEL_NAME = (ART / "emb_model_name.txt").read_text().strip()

@st.cache_resource(show_spinner=False)
def load_models():
    """Load and cache all inference artifacts (once per Streamlit session).

    Returns a tuple of:
        emb            -- SentenceTransformer encoder named in
                          artifacts/emb_model_name.txt
        clf            -- trained LightGBM classifier
        nn             -- fitted nearest-neighbour index over training embeddings
        tfidf          -- TF-IDF explainer artifact
        train_meta     -- dict with train titles / abstracts / labels
        class_keywords -- dict mapping label -> list of keyword strings
    """
    # BUG FIX: the encoder was hard-coded to all-MiniLM-L6-v2, but the
    # classifier and NN index were trained on embeddings from EMB_MODEL_NAME
    # (an E5-style model, per the "passage: " prefix in encode_one and the
    # sidebar display). Load the recorded model so vectors match the index.
    emb = SentenceTransformer(EMB_MODEL_NAME)
    clf = joblib.load(ART / "lgbm_model.pkl")
    nn = joblib.load(ART / "nn_index.pkl")
    tfidf = joblib.load(ART / "tfidf_explainer.pkl")
    # Use context managers so JSON file handles are closed promptly.
    with open(ART / "train_meta.json", encoding="utf-8") as f:
        train_meta = json.load(f)
    with open(ART / "class_keywords.json", encoding="utf-8") as f:
        class_keywords = json.load(f)
    return emb, clf, nn, tfidf, train_meta, class_keywords

def encode_one(emb_model, text: str) -> np.ndarray:
    """Embed a single abstract with the E5-style "passage: " prefix.

    Returns a float32 array of shape (1, dim); rows are unit-normalized
    by the encoder (normalize_embeddings=True).
    """
    query = f"passage: {text.strip()}"
    vectors = emb_model.encode(
        [query],
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    return np.asarray(vectors, dtype=np.float32)

# --- Page chrome -----------------------------------------------------------
st.set_page_config(page_title="ArXiv Abstract Classifier", page_icon="🧠", layout="wide")
st.title("🧠 ArXiv Abstract Classifier")
st.caption("Embeddings (E5) + LightGBM • Probabilities • Similar Papers • Class Keywords")

# --- Sidebar controls ------------------------------------------------------
with st.sidebar:
    st.markdown("### Settings")
    topk = st.slider("Top similar papers", 1, 10, 3)
    show_keywords = st.checkbox("Show class keywords", value=True)
    st.divider()
    st.markdown("Model")
    st.code(f"Encoder: {EMB_MODEL_NAME}\nClassifier: LightGBM", language="yaml")

# Cached via @st.cache_resource, so this is cheap after the first run.
emb_model, clf, nn, tfidf, train_meta, class_keywords = load_models()

# --- Input area ------------------------------------------------------------
default_text = """We propose a novel neural architecture for efficient transformer inference,
reducing memory footprint while maintaining accuracy on common NLP tasks. 
Experiments on translation and summarization demonstrate competitive results."""
text = st.text_area("Paste paper abstract here:", default_text, height=220)

col1, col2 = st.columns([1, 1])
with col1:
    run = st.button("🔍 Classify")
with col2:
    clear = st.button("🧹 Clear")
    if clear:
        # FIX: st.experimental_rerun() is deprecated and removed in current
        # Streamlit releases; prefer st.rerun(), falling back for older installs.
        if hasattr(st, "rerun"):
            st.rerun()
        else:
            st.experimental_rerun()

if run:
    # Require a non-empty abstract before doing any work.
    if not text.strip():
        st.warning("Please enter an abstract.")
        st.stop()

    # --- Classification ----------------------------------------------------
    vec = encode_one(emb_model, text)
    proba = clf.predict_proba(vec)[0]
    best = int(np.argmax(proba))
    pred_label = LABELS[best]

    st.success(f"**Predicted field:** `{pred_label}`")
    st.write("### Class probabilities")
    st.bar_chart({label: float(p) for label, p in zip(LABELS, proba)})

    # --- Nearest neighbours in the training set ----------------------------
    st.write("### 🔗 Most similar training papers")
    dists, idxs = nn.kneighbors(vec, n_neighbors=max(topk, 3), return_distance=True)
    neighbor_ids = idxs[0].tolist()
    neighbor_dists = dists[0].tolist()

    titles = train_meta["train_titles"]
    abstracts = train_meta["train_abstracts"]
    labels = train_meta["train_labels"]

    for rank, (i, dist) in enumerate(zip(neighbor_ids[:topk], neighbor_dists[:topk]), start=1):
        # NOTE(review): assumes the index stores cosine *distance*, so
        # similarity = 1 - distance — confirm against the index build step.
        similarity = 1 - dist
        with st.container(border=True):
            st.markdown(f"**#{rank}. {titles[i]}**")
            st.caption(f"_Label:_ `{labels[i]}` • _Cosine similarity:_ **{similarity:.3f}**")
            snippet = abstracts[i][:600]
            if len(abstracts[i]) > 600:
                snippet += "..."
            st.write(snippet)

    # --- Per-class keyword summary -----------------------------------------
    if show_keywords:
        st.write("### 🏷️ Class keywords (TF-IDF centroids)")
        columns = st.columns(len(LABELS))
        for column, label_name in zip(columns, LABELS):
            with column:
                st.markdown(f"**{label_name}**")
                st.write(", ".join(class_keywords.get(label_name, [])[:15]))