NLarchive committed on
Commit 1d26c01 · verified · 1 Parent(s): c179fc0

Create app.py

Files changed (1)
app.py +313 -0
app.py ADDED
@@ -0,0 +1,313 @@
+ import gradio as gr
+ from typing import Dict, List, Union
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import re
+ from collections import Counter
+
+ # Initialize a lightweight embedding model
+ model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ def semantic_similarity(text1: str, text2: str) -> Dict[str, Union[float, str]]:
+     """
+     Calculate semantic similarity between two texts using embeddings.
+
+     Args:
+         text1 (str): First text to compare
+         text2 (str): Second text to compare
+
+     Returns:
+         dict: Similarity score and analysis for the two texts
+     """
+     if not text1.strip() or not text2.strip():
+         return {
+             "similarity_score": 0.0,
+             "analysis": "empty text provided",
+             "status": "error"
+         }
+
+     try:
+         # Generate embeddings
+         embeddings = model.encode([text1, text2])
+
+         # Calculate cosine similarity
+         similarity = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0][0]
+         # Analysis based on similarity score
+         if similarity >= 0.8:
+             analysis = "very similar"
+         elif similarity >= 0.6:
+             analysis = "similar"
+         elif similarity >= 0.4:
+             analysis = "somewhat related"
+         elif similarity >= 0.2:
+             analysis = "slightly related"
+         else:
+             analysis = "not related"
+
+         return {
+             "similarity_score": round(float(similarity), 4),
+             "analysis": analysis,
+             "status": "success",
+             "text1_length": len(text1),
+             "text2_length": len(text2)
+         }
+
+     except Exception as e:
+         return {
+             "similarity_score": 0.0,
+             "analysis": f"error: {str(e)}",
+             "status": "error"
+         }
+
+ def find_similar_sentences(query: str, document: str, top_k: int = 3) -> Dict[str, Union[List, str, int]]:
+     """
+     Find the sentences in a document most semantically similar to a query.
+
+     Args:
+         query (str): Search query
+         document (str): Document to search within
+         top_k (int): Number of top similar sentences to return
+
+     Returns:
+         dict: Most similar sentences with similarity scores
+     """
+     if not query.strip() or not document.strip():
+         return {
+             "status": "error",
+             "message": "Query and document cannot be empty",
+             "results": []
+         }
+
+     try:  # Split document into sentences, keeping only non-trivial ones
+         sentences = re.split(r'[.!?]+', document)
+         sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]
+
+         if not sentences:
+             return {
+                 "status": "error",
+                 "message": "No valid sentences found in document",
+                 "results": []
+             }
+
+         # Generate embeddings
+         query_embedding = model.encode([query])
+         sentence_embeddings = model.encode(sentences)
+
+         # Calculate similarities
+         similarities = cosine_similarity(query_embedding, sentence_embeddings)[0]
+
+         # Get top-k results (int cast guards against float slider values)
+         top_indices = np.argsort(similarities)[::-1][:int(top_k)]
+
+         results = []
+         for i, idx in enumerate(top_indices):
+             results.append({
+                 "rank": i + 1,
+                 "similarity_score": round(float(similarities[idx]), 4),
+                 "sentence": sentences[idx],
+                 "sentence_length": len(sentences[idx])
+             })
+
+         return {
+             "status": "success",
+             "message": f"Found {len(results)} similar sentences",
+             "results": results,
+             "total_sentences": len(sentences),
+             "query": query
+         }
+
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": f"Error: {str(e)}",
+             "results": []
+         }
+
+ def extract_semantic_keywords(text: str, max_keywords: int = 10) -> Dict[str, Union[List, str, int]]:
+     """
+     Extract keywords by simple term-frequency analysis.
+
+     Args:
+         text (str): Text to extract keywords from
+         max_keywords (int): Maximum number of keywords to extract
+
+     Returns:
+         dict: Extracted keywords with relevance scores
+     """
+     if not text.strip():
+         return {
+             "status": "error",
+             "message": "Text cannot be empty",
+             "keywords": []
+         }
+
+     try:
+         # Clean and tokenize (the regex keeps only words of 3+ letters)
+         words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
+
+         # Stop words
+         stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among', 'this', 'that', 'these', 'those', 'is', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'must', 'shall', 'you', 'your', 'yours', 'yourself', 'yourselves'}
+
+         # Filter out stop words
+         filtered_words = [word for word in words if word not in stop_words]
+
+         # Count frequencies
+         word_freq = Counter(filtered_words)
+
+         # Get top words by frequency (int cast guards against float slider values)
+         top_words = word_freq.most_common(int(max_keywords))
+
+         # Calculate relevance scores (simple term frequency)
+         total_words = len(filtered_words)
+         keywords = []
+
+         for word, freq in top_words:
+             relevance = freq / total_words
+             keywords.append({
+                 "keyword": word,
+                 "frequency": freq,
+                 "relevance_score": round(relevance, 4),
+                 "tf_score": round(freq / total_words * 100, 2)  # Term frequency as a percentage
+             })
+
+         return {
+             "status": "success",
+             "message": f"Extracted {len(keywords)} keywords",
+             "keywords": keywords,
+             "total_words": total_words,
+             "unique_words": len(word_freq),
+             "text_length": len(text)
+         }
+
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": f"Error: {str(e)}",
+             "keywords": []
+         }
+
+ def semantic_search_in_text(query: str, documents_text: str, max_results: int = 5) -> Dict[str, Union[List, str, int]]:
+     """
+     Search for semantically similar content within provided text documents.
+
+     Args:
+         query (str): Search query
+         documents_text (str): Multiple documents separated by blank lines or single newlines
+         max_results (int): Maximum number of results to return
+
+     Returns:
+         dict: Search results with similarity scores
+     """
+     if not query.strip() or not documents_text.strip():
+         return {
+             "status": "error",
+             "message": "Query and documents cannot be empty",
+             "results": []
+         }
+
+     try:
+         # Split into paragraphs/documents
+         paragraphs = [p.strip() for p in documents_text.split('\n\n') if p.strip() and len(p.strip()) > 20]
+
+         if not paragraphs:
+             # Fall back to splitting on single newlines
+             paragraphs = [p.strip() for p in documents_text.split('\n') if p.strip() and len(p.strip()) > 20]
+
+         if not paragraphs:
+             return {
+                 "status": "error",
+                 "message": "No valid paragraphs found in documents",
+                 "results": []
+             }
+
+         # Generate embeddings
+         query_embedding = model.encode([query])
+         paragraph_embeddings = model.encode(paragraphs)
+
+         # Calculate similarities
+         similarities = cosine_similarity(query_embedding, paragraph_embeddings)[0]
+
+         # Get top results (int cast guards against float slider values)
+         top_indices = np.argsort(similarities)[::-1][:int(max_results)]
+
+         results = []
+         for i, idx in enumerate(top_indices):
+             results.append({
+                 "rank": i + 1,
+                 "similarity_score": round(float(similarities[idx]), 4),
+                 "content": paragraphs[idx],
+                 "content_length": len(paragraphs[idx])
+             })
+
+         return {
+             "status": "success",
+             "message": f"Found {len(results)} relevant paragraphs",
+             "results": results,
+             "total_documents": len(paragraphs),
+             "query": query
+         }
+
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": f"Error: {str(e)}",
+             "results": []
+         }
+
+ # Create Gradio interfaces
+ demo_similarity = gr.Interface(
+     fn=semantic_similarity,
+     inputs=[
+         gr.Textbox(placeholder="Enter first text...", label="Text 1", lines=3, value="I love machine learning and AI"),
+         gr.Textbox(placeholder="Enter second text...", label="Text 2", lines=3, value="Artificial intelligence and ML are fascinating")
+     ],
+     outputs=gr.JSON(),
+     title="🔗 Semantic Similarity",
+     description="Calculate semantic similarity between two texts using embeddings"
+ )
+
+ demo_find_similar = gr.Interface(
+     fn=find_similar_sentences,
+     inputs=[
+         gr.Textbox(placeholder="Search query...", label="Query", value="machine learning"),
+         gr.Textbox(placeholder="Document text...", label="Document", lines=5, value="Machine learning is a subset of AI. Deep learning uses neural networks. Natural language processing handles text."),
+         gr.Slider(1, 10, value=3, step=1, label="Number of Results")
+     ],
+     outputs=gr.JSON(),
+     title="🎯 Find Similar Sentences",
+     description="Find the sentences in a document most similar to your query"
+ )
+
+ demo_keywords = gr.Interface(
+     fn=extract_semantic_keywords,
+     inputs=[
+         gr.Textbox(placeholder="Text to extract keywords from...", label="Text", lines=5, value="Machine learning and artificial intelligence are transforming technology"),
+         gr.Slider(1, 20, value=10, step=1, label="Max Keywords")
+     ],
+     outputs=gr.JSON(),
+     title="🏷️ Keyword Extraction",
+     description="Extract relevant keywords from text by term frequency"
+ )
+
+ demo_search = gr.Interface(
+     fn=semantic_search_in_text,
+     inputs=[
+         gr.Textbox(placeholder="Search query...", label="Search Query", value="neural networks"),
+         gr.Textbox(placeholder="Documents (separated by empty lines)...", label="Documents", lines=8, value="Deep learning uses neural networks.\n\nMachine learning algorithms learn patterns.\n\nAI systems can process natural language."),
+         gr.Slider(1, 10, value=5, step=1, label="Max Results")
+     ],
+     outputs=gr.JSON(),
+     title="🔍 Semantic Text Search",
+     description="Search for relevant content within provided documents using semantic similarity"
+ )
+
+ # Combine all interfaces
+ demo = gr.TabbedInterface(
+     [demo_similarity, demo_find_similar, demo_keywords, demo_search],
+     ["Similarity", "Find Sentences", "Keywords", "Search in Text"],
+     title="🧠 Semantic Analysis Suite (Stateless)"
+ )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)