Spaces:

somewheresystems
/

dataclysm

Paused

App Files Files Community

somewheresy commited on Jan 24, 2024

Commit

ac664de

verified ·

1 Parent(s): a7193d8

Update app.py

Browse files

Files changed (1) hide show

app.py +290 -53

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# Import necessary libraries
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -6,16 +5,58 @@ from sklearn.manifold import TSNE
 from datasets import load_dataset, Dataset
 from sklearn.cluster import KMeans
 import plotly.graph_objects as go
-import time
 import logging
 # Additional libraries for querying
 from FlagEmbedding import FlagModel
 # Global variables and dataset loading
 global dataset_name
-dataset_name = 'somewheresystems/dataclysm-arxiv'
 st.session_state.dataclysm_arxiv = load_dataset(dataset_name, split="train")
 total_samples = len(st.session_state.dataclysm_arxiv)
@@ -75,7 +116,7 @@ def perform_tsne(embeddings):
     tsne_results = tsne.fit_transform(np.vstack(embeddings.tolist()))
     # Update progress bar to indicate completion
-    progress_text.text(f"t-SNE completed. Processed {n_samples} samples with perplexity {perplexity}.")
     end_time = time.time()  # End timing
     st.sidebar.text(f't-SNE completed in {end_time - start_time:.3f} seconds')
     return tsne_results
@@ -83,20 +124,71 @@ def perform_tsne(embeddings):
 def perform_clustering(df, tsne_results):
     start_time = time.time()
-    # Perform KMeans clustering
-    logging.info('Performing k-means clustering...')
     # Step 3: Visualization with Plotly
-    df['tsne-3d-one'] = tsne_results[:,0]
-    df['tsne-3d-two'] = tsne_results[:,1]
-    df['tsne-3d-three'] = tsne_results[:,2]
-    # Perform KMeans clustering
-    kmeans = KMeans(n_clusters=16)  # Change the number of clusters as needed
-    df['cluster'] = kmeans.fit_predict(df[['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']])
     end_time = time.time()  # End timing
-    st.sidebar.text(f'k-means clustering completed in {end_time - start_time:.3f} seconds')
     return df
 def main():
     # Custom CSS
     custom_css = """
@@ -112,48 +204,184 @@ def main():
             color: #F8F8F8; /* Set the font color to F8F8F8 */
         }
         /* Add your CSS styles here */
         h1 {
             text-align: center;
         }
         h2,h3,h4 {
             text-align: justify;
-            font-size: 8px
         }
         body {
-            text-align: justify;
         }
         .stSlider .css-1cpxqw2 {
             background: #202020;
         }
         .stButton > button {
             background-color: #202020;
-            width: 100%;
-            border: none;
             padding: 10px 24px;
-            border-radius: 5px;
             font-size: 16px;
             font-weight: bold;
         }
         .reportview-container .main .block-container {
-            padding: 2rem;
             background-color: #202020;
         }
     </style>
     """
     # Inject custom CSS with markdown
     st.markdown(custom_css, unsafe_allow_html=True)
     st.sidebar.markdown(
-        f'<img src="https://www.somewhere.systems/S2-white-logo.png" style="float: bottom-left; width: 32px; height: 32px; opacity: 1.0; animation: fadein 2s;">',
         unsafe_allow_html=True
     )
-    st.sidebar.title('Spatial Search Engine')
     # Check if data needs to be loaded
     if 'data_loaded' not in st.session_state or not st.session_state.data_loaded:
         # User input for number of samples
-        num_samples = st.sidebar.slider('Select number of samples', 1000, total_samples, 1000)
         if st.sidebar.button('Initialize'):
             st.sidebar.text('Initializing data pipeline...')
@@ -171,8 +399,6 @@ def main():
                 print(f"FAISS index for {column_name} added.")
                 return dataset
             # Load data and perform t-SNE and clustering
             df, embeddings = load_data(num_samples)
@@ -209,35 +435,46 @@ def main():
                 marker=dict(
                     size=1,
                     color=df['cluster'],
-                    colorscale='Viridis',
-                    opacity=0.8
                 )
             )])
             fig.update_layout(
-                plot_bgcolor='#202020',
                 height=800,
                 margin=dict(l=0, r=0, b=0, t=0),
-                scene=dict(
-                    xaxis=dict(showbackground=True, backgroundcolor="#000000"),
-                    yaxis=dict(showbackground=True, backgroundcolor="#000000"),
-                    zaxis=dict(showbackground=True, backgroundcolor="#000000"),
-                ),
-                scene_camera=dict(eye=dict(x=0.001, y=0.001, z=0.001))
             )
             st.session_state.fig = fig
     # Display the plot if data is loaded
     if 'data_loaded' in st.session_state and st.session_state.data_loaded:
-        st.plotly_chart(st.session_state.fig, use_container_width=True)
     # Sidebar for detailed view
     if 'df' in st.session_state:
         # Sidebar for querying
         with st.sidebar:
-            st.sidebar.markdown("### Query Embeddings")
-            query = st.text_input("Enter your query:")
             if st.button("Search"):
                 # Define the model
                 print("Initializing model...")
@@ -248,7 +485,7 @@ def main():
                 query_embedding = model.encode([query])
                 # Retrieve examples by title similarity (or abstract, depending on your preference)
-                scores_title, retrieved_examples_title = st.session_state.dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=10)
                 df_query = pd.DataFrame(retrieved_examples_title)
                 df_query['proximity'] = scores_title
                 df_query = df_query.sort_values(by='proximity', ascending=True)
@@ -257,19 +494,19 @@ def main():
                 # Fix the <a href link> to display properly
                 df_query['URL'] = df_query['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}" target="_blank">Link</a>')
                 st.sidebar.markdown(df_query[['title', 'proximity', 'id']].to_html(escape=False), unsafe_allow_html=True)
-            st.sidebar.markdown("# Detailed View")
-            selected_index = st.sidebar.selectbox("Select Key", st.session_state.df.id)
-            # Display metadata for the selected article
-            selected_row = st.session_state.df[st.session_state.df['id'] == selected_index].iloc[0]
-            st.markdown(f"### Title\n{selected_row['title']}", unsafe_allow_html=True)
-            st.markdown(f"### Abstract\n{selected_row['abstract']}", unsafe_allow_html=True)
-            st.markdown(f"[Read the full paper](https://arxiv.org/abs/{selected_row['id']})", unsafe_allow_html=True)
-            st.markdown(f"[Download PDF](https://arxiv.org/pdf/{selected_row['id']})", unsafe_allow_html=True)
-if __name__ == "__main__":
-    main()

 import streamlit as st
 import pandas as pd
 import numpy as np
 from datasets import load_dataset, Dataset
 from sklearn.cluster import KMeans
 import plotly.graph_objects as go
+import time, random, datetime
 import logging
+from sklearn.cluster import HDBSCAN
+BACKGROUND_COLOR = 'black'
+COLOR = 'white'
+def set_page_container_style(
+        max_width: int = 10000, max_width_100_percent: bool = False,
+        padding_top: int = 1, padding_right: int = 10, padding_left: int = 1, padding_bottom: int = 10,
+        color: str = COLOR, background_color: str = BACKGROUND_COLOR,
+    ):
+        if max_width_100_percent:
+            max_width_str = f'max-width: 100%;'
+        else:
+            max_width_str = f'max-width: {max_width}px;'
+        st.markdown(
+            f'''
+            <style>
+                .reportview-container .css-1lcbmhc .css-1outpf7 {{
+                    padding-top: 35px;
+                }}
+                .reportview-container .main .block-container {{
+                    {max_width_str}
+                    padding-top: {padding_top}rem;
+                    padding-right: {padding_right}rem;
+                    padding-left: {padding_left}rem;
+                    padding-bottom: {padding_bottom}rem;
+                }}
+                .reportview-container .main {{
+                    color: {color};
+                    background-color: {background_color};
+                }}
+            </style>
+            ''',
+            unsafe_allow_html=True,
+        )
 # Additional libraries for querying
 from FlagEmbedding import FlagModel
 # Global variables and dataset loading
 global dataset_name
+st.set_page_config(layout="wide")
+dataset_name = "somewheresystems/dataclysm-arxiv"
+set_page_container_style(
+        max_width = 1600, max_width_100_percent = True,
+        padding_top = 0, padding_right = 10, padding_left = 5, padding_bottom = 10
+)
 st.session_state.dataclysm_arxiv = load_dataset(dataset_name, split="train")
 total_samples = len(st.session_state.dataclysm_arxiv)
     tsne_results = tsne.fit_transform(np.vstack(embeddings.tolist()))
     # Update progress bar to indicate completion
+    progress_text.text(f"t-SNE completed at {datetime.datetime.now()}. Processed {n_samples} samples with perplexity {perplexity}.")
     end_time = time.time()  # End timing
     st.sidebar.text(f't-SNE completed in {end_time - start_time:.3f} seconds')
     return tsne_results
 def perform_clustering(df, tsne_results):
     start_time = time.time()
+    # Perform DBSCAN clustering
+    logging.info('Performing HDBSCAN clustering...')
     # Step 3: Visualization with Plotly
+    # Normalize the t-SNE results between 0 and 1
+    df['tsne-3d-one'] = (tsne_results[:,0] - tsne_results[:,0].min()) / (tsne_results[:,0].max() - tsne_results[:,0].min())
+    df['tsne-3d-two'] = (tsne_results[:,1] - tsne_results[:,1].min()) / (tsne_results[:,1].max() - tsne_results[:,1].min())
+    df['tsne-3d-three'] = (tsne_results[:,2] - tsne_results[:,2].min()) / (tsne_results[:,2].max() - tsne_results[:,2].min())
+    # Perform DBSCAN clustering
+    hdbscan = HDBSCAN(min_cluster_size=10, min_samples=50)
+    cluster_labels = hdbscan.fit_predict(df[['tsne-3d-one', 'tsne-3d-two', 'tsne-3d-three']])
+    df['cluster'] = cluster_labels
     end_time = time.time()  # End timing
+    st.sidebar.text(f'HDBSCAN clustering completed in {end_time - start_time:.3f} seconds')
     return df
+def update_camera_position(fig, df, df_query, result_id, K=10):
+    # Focus the camera on the closest result
+    top_K_ids = df_query.sort_values(by='proximity', ascending=True).head(K)['id'].tolist()
+    top_K_proximity = df_query['proximity'].tolist()
+    top_results = df[df['id'].isin(top_K_ids)]
+    camera_focus = dict(
+        eye=dict(x=top_results.iloc[0]['tsne-3d-one']*0.1, y=top_results.iloc[0]['tsne-3d-two']*0.1, z=top_results.iloc[0]['tsne-3d-three']*0.1)
+    )
+    # Normalize the proximity values to range between 1 and 10
+    normalized_proximity = [10 - (10 * (prox - min(top_K_proximity)) / (max(top_K_proximity) - min(top_K_proximity))) for prox in top_K_proximity]
+    # Create a dictionary mapping id to normalized proximity
+    id_to_proximity = dict(zip(top_K_ids, normalized_proximity))
+    # Set marker sizes based on proximity for top K ids, all other points stay the same -- 500% zoom
+    marker_sizes = [5 * id_to_proximity[id] if id in top_K_ids else 1 for id in df['id']]
+    # Store the original colors in a separate column
+    df['color'] = df['cluster']
+    fig = go.Figure(data=[go.Scatter3d(
+        x=df['tsne-3d-one'],
+        y=df['tsne-3d-two'],
+        z=df['tsne-3d-three'],
+        mode='markers',
+        marker=dict(size=marker_sizes, color=df['color'], colorscale='Viridis', opacity=0.8, line_width=0),
+        hovertext=df['hovertext'],
+        hoverinfo='text',
+    )])
+    # Set grid opacity to 10%
+    fig.update_layout(scene = dict(xaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
+                                    yaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
+                                    zaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)')))
+    # Add lines stemming from the first point to all other points in the top K
+    for i in range(1, K):  # there are K-1 lines from the first point to the other K-1 points
+        fig.add_trace(go.Scatter3d(
+            x=[top_results.iloc[0]['tsne-3d-one'], top_results.iloc[i]['tsne-3d-one']],
+            y=[top_results.iloc[0]['tsne-3d-two'], top_results.iloc[i]['tsne-3d-two']],
+            z=[top_results.iloc[0]['tsne-3d-three'], top_results.iloc[i]['tsne-3d-three']],
+            mode='lines',
+            line=dict(color='white',width=0.3),  # Set line opacity to 50%
+            showlegend=True,
+            name="centroid" if i == -1 else top_results.iloc[i]['id'],  # Set the legend to "Top Result" for the first entry, and to the title of the article for the rest
+            hovertext=f'Title: Top K Results\nID: {top_K_ids[i]}, Proximity: {round(top_K_proximity[i], 4)}',
+            hoverinfo='text',
+        ))
+    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)',
+                paper_bgcolor='rgba(0,0,0,0)',
+                scene_camera=camera_focus)
+    return fig
 def main():
     # Custom CSS
     custom_css = """
             color: #F8F8F8; /* Set the font color to F8F8F8 */
         }
         /* Add your CSS styles here */
+        .stPlotlyChart {
+            width: 100%;
+            height: 100%;
+        /* Other styles... */
+        }
         h1 {
             text-align: center;
         }
         h2,h3,h4 {
             text-align: justify;
+            font-size: 8px;
+        }
+        st-emotion-cache-1wmy9hl {
+            font-size: 8px;
         }
         body {
+            color: #fff;
+            background-color: #202020;
         }
         .stSlider .css-1cpxqw2 {
             background: #202020;
+            color: #fd5137;
+        }
+        .stSlider .text {
+            background: #202020;
+            color: #fd5137;
         }
         .stButton > button {
             background-color: #202020;
+            width: 60%;
+            margin-left: auto;
+            margin-right: auto;
+            display: block;
             padding: 10px 24px;
             font-size: 16px;
             font-weight: bold;
+            border: 1px solid #f8f8f8;
+        }
+        .stButton > button:hover {
+            color: #Fd5137
+            border: 1px solid #fd5137;
+        }
+        .stButton > button:active {
+            color: #F8F8F8;
+            border: 1px solid #fd5137;
+            background-color: #fd5137;
         }
         .reportview-container .main .block-container {
+            padding: 0;
             background-color: #202020;
+            width: 100%; /* Make the plotly graph take up full width */
+        }
+        .sidebar .sidebar-content {
+            background-image: linear-gradient(#202020,#202020);
+            color: white;
+            size: 0.2em; /* Make the text in the sidebar smaller */
+            padding: 0;
         }
+        .reportview-container .main .block-container {
+            background-color: #000000;
+        }
+        .stText {
+            padding: 0;
+        }
+        /* Set the main background color to #202020 */
+        .appview-container {
+            background-color: #000000;
+            padding: 0;
+        }
+        .stVerticalBlockBorderWrapper{
+            padding: 0;
+            margin-left: 0px;
+        }
+        .st-emotion-cache-1cypcdb {
+            background-color: #202020;
+            background-image: none;
+            color: #000000;
+            padding: 0;
+        }
+        .stPlotlyChart {
+            background-color: #000000;
+            background-image: none;
+            color: #000000;
+            padding: 0;
+        }
+        .reportview-container .css-1lcbmhc .css-1outpf7 {
+            padding-top: 35px;
+        }
+        .reportview-container .main .block-container {
+            max-width: 100%;
+            padding-top: 0rem;
+            padding-right: 0rem;
+            padding-left: 0rem;
+            padding-bottom: 10rem;
+        }
+        .reportview-container .main {
+            color: white;
+            background-color: black;
+        }
+        .st-emotion-cache-1avcm0n {
+            color: black;
+            background-color: black;
+        }
+        .st-emotion-cache-z5fcl4 {
+            padding-left: 0.1rem;
+            padding-right: 0.1rem;
+        }
+        .st-emotion-cache-z5fcl4 {
+            width: 100%;
+            padding: 3rem 1rem 1rem;
+            min-width: auto;
+            max-width: initial;
+        }
+        .st-emotion-cache-uf99v8 {
+            display: flex;
+            flex-direction: column;
+            width: 100%;
+            overflow: hidden;
+            -webkit-box-align: center;
+            align-items: center;
+        }
     </style>
     """
     # Inject custom CSS with markdown
     st.markdown(custom_css, unsafe_allow_html=True)
+    st.sidebar.title('arXiv Spatial Search Engine')
     st.sidebar.markdown(
+        '<a href="http://dataclysm.xyz" target="_blank" style="display: flex; justify-content: center; padding: 10px;">dataclysm.xyz <img src="https://www.somewhere.systems/S2-white-logo.png" style="width: 8px; height: 8px;"></a>',
         unsafe_allow_html=True
     )
+    # Create a placeholder for the chart
+    chart_placeholder = st.empty()
     # Check if data needs to be loaded
     if 'data_loaded' not in st.session_state or not st.session_state.data_loaded:
         # User input for number of samples
+        num_samples = st.sidebar.slider('Select number of samples', 1000, int(round(total_samples/10)), 1000)
+        if 'fig' not in st.session_state:
+            with open('prayers.txt', 'r') as file:
+                lines = file.readlines()
+                random_line = random.choice(lines).strip()
+            st.session_state.fig = go.Figure(data=[go.Scatter3d(x=[], y=[], z=[], mode='markers')])
+            st.session_state.fig.add_annotation(
+                x=0.5,
+                y=0.5,
+                xref="paper",
+                yref="paper",
+                text=random_line,
+                showarrow=False,
+                font=dict(
+                    size=16,
+                    color="black"
+                ),
+                align="center",
+                ax=0,
+                ay=0,
+                bordercolor="black",
+                borderwidth=2,
+                borderpad=4,
+                bgcolor="white",
+                opacity=0.8
+            )
+            # Set grid opacity to 10%
+            st.session_state.fig.update_layout(scene = dict(xaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
+                                           yaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
+                                           zaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)')))
+            st.session_state.fig.update_layout(
+                plot_bgcolor='rgba(0,0,0,0)',
+                paper_bgcolor='rgba(0,0,0,0)',
+                height=888,
+                margin=dict(l=0, r=0, b=0, t=0),
+                scene_camera=dict(eye=dict(x=0.1, y=0.1, z=0.1))
+            )
+        chart_placeholder.plotly_chart(st.session_state.fig, use_container_width=True)
         if st.sidebar.button('Initialize'):
             st.sidebar.text('Initializing data pipeline...')
                 print(f"FAISS index for {column_name} added.")
                 return dataset
             # Load data and perform t-SNE and clustering
             df, embeddings = load_data(num_samples)
                 marker=dict(
                     size=1,
                     color=df['cluster'],
+                    colorscale='Jet',
+                    opacity=0.75
                 )
             )])
+            # Set grid opacity to 10%
+            fig.update_layout(scene = dict(xaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
+                                           yaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)'),
+                                           zaxis = dict(gridcolor='rgba(128, 128, 128, 0.1)', color='rgba(128, 128, 128, 0.1)')))
             fig.update_layout(
+                plot_bgcolor='rgba(0,0,0,0)',
+                paper_bgcolor='rgba(0,0,0,0)',
                 height=800,
                 margin=dict(l=0, r=0, b=0, t=0),
+                scene_camera=dict(eye=dict(x=0.1, y=0.1, z=0.1))
             )
             st.session_state.fig = fig
     # Display the plot if data is loaded
     if 'data_loaded' in st.session_state and st.session_state.data_loaded:
+        chart_placeholder.plotly_chart(st.session_state.fig, use_container_width=True)
     # Sidebar for detailed view
     if 'df' in st.session_state:
         # Sidebar for querying
         with st.sidebar:
+            st.sidebar.markdown("# Detailed View")
+            selected_index = st.sidebar.selectbox("Select Key", st.session_state.df.id)
+            # Display metadata for the selected article
+            selected_row = st.session_state.df[st.session_state.df['id'] == selected_index].iloc[0]
+            st.markdown(f"### Title\n{selected_row['title']}", unsafe_allow_html=True)
+            st.markdown(f"### Abstract\n{selected_row['abstract']}", unsafe_allow_html=True)
+            st.markdown(f"[Read the full paper](https://arxiv.org/abs/{selected_row['id']})", unsafe_allow_html=True)
+            st.markdown(f"[Download PDF](https://arxiv.org/pdf/{selected_row['id']})", unsafe_allow_html=True)
+            st.sidebar.markdown("### Find Similar in Latent Space")
+            query = st.text_input("", value=selected_row['title'])
+            top_k = st.slider("top k", 1, 100, 10)
             if st.button("Search"):
                 # Define the model
                 print("Initializing model...")
                 query_embedding = model.encode([query])
                 # Retrieve examples by title similarity (or abstract, depending on your preference)
+                scores_title, retrieved_examples_title = st.session_state.dataclysm_title_indexed.get_nearest_examples('title_embedding', query_embedding, k=top_k)
                 df_query = pd.DataFrame(retrieved_examples_title)
                 df_query['proximity'] = scores_title
                 df_query = df_query.sort_values(by='proximity', ascending=True)
                 # Fix the <a href link> to display properly
                 df_query['URL'] = df_query['id'].apply(lambda x: f'<a href="https://arxiv.org/abs/{x}" target="_blank">Link</a>')
                 st.sidebar.markdown(df_query[['title', 'proximity', 'id']].to_html(escape=False), unsafe_allow_html=True)
+                # Get the ID of the top search result
+                top_result_id = df_query.iloc[0]['id']
+                # Update the camera position and appearance of points
+                updated_fig = update_camera_position(st.session_state.fig, st.session_state.df, df_query, top_result_id,top_k)
+                # Update the figure in the session state and redraw the plot
+                st.session_state.fig = updated_fig
+                # Update the chart using the placeholder
+                chart_placeholder.plotly_chart(st.session_state.fig, use_container_width=True)
+if __name__ == "__main__":
+    main()