NLarchive committed on
Commit 1d26c01 · verified · 1 Parent(s): c179fc0

Create app.py

Files changed (1)
app.py +313 -0
app.py ADDED
@@ -0,0 +1,313 @@
+ import gradio as gr
+ from typing import Dict, List, Union
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import re
+ from collections import Counter
+
+ # Initialize a lightweight embedding model
+ model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ def semantic_similarity(text1: str, text2: str) -> Dict[str, Union[float, str]]:
+     """
+     Calculate semantic similarity between two texts using embeddings.
+
+     Args:
+         text1 (str): First text to compare
+         text2 (str): Second text to compare
+
+     Returns:
+         dict: Similarity score and analysis for the two texts
+     """
+     if not text1.strip() or not text2.strip():
+         return {
+             "similarity_score": 0.0,
+             "analysis": "empty text provided",
+             "status": "error"
+         }
+
+     try:
+         # Generate embeddings
+         embeddings = model.encode([text1, text2])
+
+         # Calculate cosine similarity
+         similarity = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0][0]
+         # Analysis based on similarity score
+         if similarity >= 0.8:
+             analysis = "very similar"
+         elif similarity >= 0.6:
+             analysis = "similar"
+         elif similarity >= 0.4:
+             analysis = "somewhat related"
+         elif similarity >= 0.2:
+             analysis = "slightly related"
+         else:
+             analysis = "not related"
+
+         return {
+             "similarity_score": round(float(similarity), 4),
+             "analysis": analysis,
+             "status": "success",
+             "text1_length": len(text1),
+             "text2_length": len(text2)
+         }
+
+     except Exception as e:
+         return {
+             "similarity_score": 0.0,
+             "analysis": f"error: {str(e)}",
+             "status": "error"
+         }
+
+ def find_similar_sentences(query: str, document: str, top_k: int = 3) -> Dict[str, Union[List, str, int]]:
+     """
+     Find the sentences in a document most semantically similar to a query.
+
+     Args:
+         query (str): Search query
+         document (str): Document to search within
+         top_k (int): Number of top similar sentences to return
+
+     Returns:
+         dict: Most similar sentences with similarity scores
+     """
+     if not query.strip() or not document.strip():
+         return {
+             "status": "error",
+             "message": "Query and document cannot be empty",
+             "results": []
+         }
+
+     try:  # Split document into sentences, keeping only non-trivial ones
+         sentences = re.split(r'[.!?]+', document)
+         sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]
+
+         if not sentences:
+             return {
+                 "status": "error",
+                 "message": "No valid sentences found in document",
+                 "results": []
+             }
+
+         # Generate embeddings
+         query_embedding = model.encode([query])
+         sentence_embeddings = model.encode(sentences)
+
+         # Calculate similarities
+         similarities = cosine_similarity(query_embedding, sentence_embeddings)[0]
+
+         # Get top-k results (int cast guards against float slider values)
+         top_indices = np.argsort(similarities)[::-1][:int(top_k)]
+
+         results = []
+         for i, idx in enumerate(top_indices):
+             results.append({
+                 "rank": i + 1,
+                 "similarity_score": round(float(similarities[idx]), 4),
+                 "sentence": sentences[idx],
+                 "sentence_length": len(sentences[idx])
+             })
+
+         return {
+             "status": "success",
+             "message": f"Found {len(results)} similar sentences",
+             "results": results,
+             "total_sentences": len(sentences),
+             "query": query
+         }
+
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": f"Error: {str(e)}",
+             "results": []
+         }
+
+ def extract_semantic_keywords(text: str, max_keywords: int = 10) -> Dict[str, Union[List, str, int]]:
+     """
+     Extract keywords by simple term-frequency analysis.
+
+     Args:
+         text (str): Text to extract keywords from
+         max_keywords (int): Maximum number of keywords to extract
+
+     Returns:
+         dict: Extracted keywords with relevance scores
+     """
+     if not text.strip():
+         return {
+             "status": "error",
+             "message": "Text cannot be empty",
+             "keywords": []
+         }
+
+     try:
+         # Clean and tokenize (the regex keeps only words of 3+ letters)
+         words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
+
+         # Stop words
+         stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among', 'this', 'that', 'these', 'those', 'is', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'must', 'shall', 'you', 'your', 'yours', 'yourself', 'yourselves'}
+
+         # Filter out stop words
+         filtered_words = [word for word in words if word not in stop_words]
+
+         # Count frequencies
+         word_freq = Counter(filtered_words)
+
+         # Get top words by frequency (int cast guards against float slider values)
+         top_words = word_freq.most_common(int(max_keywords))
+
+         # Calculate relevance scores (simple term frequency)
+         total_words = len(filtered_words)
+         keywords = []
+
+         for word, freq in top_words:
+             relevance = freq / total_words
+             keywords.append({
+                 "keyword": word,
+                 "frequency": freq,
+                 "relevance_score": round(relevance, 4),
+                 "tf_score": round(freq / total_words * 100, 2)  # Term frequency as a percentage
+             })
+
+         return {
+             "status": "success",
+             "message": f"Extracted {len(keywords)} keywords",
+             "keywords": keywords,
+             "total_words": total_words,
+             "unique_words": len(word_freq),
+             "text_length": len(text)
+         }
+
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": f"Error: {str(e)}",
+             "keywords": []
+         }
+
+ def semantic_search_in_text(query: str, documents_text: str, max_results: int = 5) -> Dict[str, Union[List, str, int]]:
+     """
+     Search for semantically similar content within provided text documents.
+
+     Args:
+         query (str): Search query
+         documents_text (str): Multiple documents separated by blank lines or single newlines
+         max_results (int): Maximum number of results to return
+
+     Returns:
+         dict: Search results with similarity scores
+     """
+     if not query.strip() or not documents_text.strip():
+         return {
+             "status": "error",
+             "message": "Query and documents cannot be empty",
+             "results": []
+         }
+
+     try:
+         # Split into paragraphs/documents
+         paragraphs = [p.strip() for p in documents_text.split('\n\n') if p.strip() and len(p.strip()) > 20]
+
+         if not paragraphs:
+             # Fall back to splitting on single newlines
+             paragraphs = [p.strip() for p in documents_text.split('\n') if p.strip() and len(p.strip()) > 20]
+
+         if not paragraphs:
+             return {
+                 "status": "error",
+                 "message": "No valid paragraphs found in documents",
+                 "results": []
+             }
+
+         # Generate embeddings
+         query_embedding = model.encode([query])
+         paragraph_embeddings = model.encode(paragraphs)
+
+         # Calculate similarities
+         similarities = cosine_similarity(query_embedding, paragraph_embeddings)[0]
+
+         # Get top results (int cast guards against float slider values)
+         top_indices = np.argsort(similarities)[::-1][:int(max_results)]
+
+         results = []
+         for i, idx in enumerate(top_indices):
+             results.append({
+                 "rank": i + 1,
+                 "similarity_score": round(float(similarities[idx]), 4),
+                 "content": paragraphs[idx],
+                 "content_length": len(paragraphs[idx])
+             })
+
+         return {
+             "status": "success",
+             "message": f"Found {len(results)} relevant paragraphs",
+             "results": results,
+             "total_documents": len(paragraphs),
+             "query": query
+         }
+
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": f"Error: {str(e)}",
+             "results": []
+         }
+
+ # Create Gradio interfaces
+ demo_similarity = gr.Interface(
+     fn=semantic_similarity,
+     inputs=[
+         gr.Textbox(placeholder="Enter first text...", label="Text 1", lines=3, value="I love machine learning and AI"),
+         gr.Textbox(placeholder="Enter second text...", label="Text 2", lines=3, value="Artificial intelligence and ML are fascinating")
+     ],
+     outputs=gr.JSON(),
+     title="🔗 Semantic Similarity",
+     description="Calculate semantic similarity between two texts using embeddings"
+ )
+
+ demo_find_similar = gr.Interface(
+     fn=find_similar_sentences,
+     inputs=[
+         gr.Textbox(placeholder="Search query...", label="Query", value="machine learning"),
+         gr.Textbox(placeholder="Document text...", label="Document", lines=5, value="Machine learning is a subset of AI. Deep learning uses neural networks. Natural language processing handles text."),
+         gr.Slider(1, 10, value=3, step=1, label="Number of Results")
+     ],
+     outputs=gr.JSON(),
+     title="🎯 Find Similar Sentences",
+     description="Find the sentences in a document most similar to your query"
+ )
+
+ demo_keywords = gr.Interface(
+     fn=extract_semantic_keywords,
+     inputs=[
+         gr.Textbox(placeholder="Text to extract keywords from...", label="Text", lines=5, value="Machine learning and artificial intelligence are transforming technology"),
+         gr.Slider(1, 20, value=10, step=1, label="Max Keywords")
+     ],
+     outputs=gr.JSON(),
+     title="🏷️ Keyword Extraction",
+     description="Extract relevant keywords from text by term frequency"
+ )
+
+ demo_search = gr.Interface(
+     fn=semantic_search_in_text,
+     inputs=[
+         gr.Textbox(placeholder="Search query...", label="Search Query", value="neural networks"),
+         gr.Textbox(placeholder="Documents (separated by empty lines)...", label="Documents", lines=8, value="Deep learning uses neural networks.\n\nMachine learning algorithms learn patterns.\n\nAI systems can process natural language."),
+         gr.Slider(1, 10, value=5, step=1, label="Max Results")
+     ],
+     outputs=gr.JSON(),
+     title="🔍 Semantic Text Search",
+     description="Search for relevant content within provided documents using semantic similarity"
+ )
+
+ # Combine all interfaces
+ demo = gr.TabbedInterface(
+     [demo_similarity, demo_find_similar, demo_keywords, demo_search],
+     ["Similarity", "Find Sentences", "Keywords", "Search in Text"],
+     title="🧠 Semantic Analysis Suite (Stateless)"
+ )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)