Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

contriever/document_embeddings.npy +3 -0
contriever/encode.py +139 -0
contriever/faiss_index.bin +3 -0

contriever/document_embeddings.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:479fc547765b1eef5e80ba8b603485eb9e6d430c472ed69c9568bc42f318df4c
+size 15919002752

contriever/encode.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import faiss
+import numpy as np
+import json
+from tqdm import tqdm
+import os
+from torch.nn import DataParallel
+from transformers import AutoTokenizer, AutoModel, T5EncoderModel
+import torch
+from sentence_transformers import SentenceTransformer
+from multiprocessing import Pool
+import time
+start_time = time.time()
+with open("merged_triple_processed_new_withID.json", "r") as fi:
+    data = json.load(fi)
+sentences = [_['contents'] for _ in data]
+print("Chunks nums: ", len(sentences))
+# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
+# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
+model_path = 'facebook/contriever'
+### Using SentenceTransformer
+# def encode_sentences_on_gpu(params):
+#     sentences_chunk, device_id = params
+#     device = torch.device(f'cuda:{device_id}')
+#     model = SentenceTransformer(model_path, device=device)
+#     embeddings = model.encode(
+#         sentences_chunk,
+#         batch_size=1024,
+#         show_progress_bar=False,
+#         convert_to_numpy=True,
+#         normalize_embeddings=True
+#     )
+#     return embeddings
+# num_gpus = torch.cuda.device_count()
+# print(f"Number of GPUs: {num_gpus}")
+# sentences_chunks = np.array_split(sentences, num_gpus)
+# params = [(sentences_chunks[i], i) for i in range(num_gpus)]
+# with Pool(processes=num_gpus) as pool:
+#     embeddings_list = pool.map(encode_sentences_on_gpu, params)
+# sentence_embeddings = np.concatenate(embeddings_list, axis=0)
+### Using Transformers
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModel.from_pretrained(model_path)
+# model = T5EncoderModel.from_pretrained(model_path)
+model = DataParallel(model)  # Wrap the model for multi-GPU support
+model.eval()
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model.to(device)
+batch_size = 1024
+def mean_pooling(token_embeddings, mask):
+    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
+    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
+    return sentence_embeddings
+def process_in_batches(sentences, batch_size):
+    sentence_embeddings_list = []
+    for i in tqdm(range(0, len(sentences), batch_size)):
+        batch_sentences = sentences[i:i + batch_size]
+        encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt').to(device)
+        with torch.no_grad():
+            model_output = model(**encoded_input)
+            batch_sentence_embeddings = mean_pooling(model_output[0], encoded_input['attention_mask'])
+            # CLS pooling for BGE
+            # batch_sentence_embeddings = model_output[0][:, 0]
+            # pooling for GTR
+            # batch_sentence_embeddings = model_output.last_hidden_state.mean(dim=1)
+            # batch_sentence_embeddings = torch.nn.functional.normalize(batch_sentence_embeddings, p=2, dim=1)
+            sentence_embeddings_list.append(batch_sentence_embeddings.cpu())  # Move to CPU to save GPU memory
+    sentence_embeddings = torch.cat(sentence_embeddings_list, dim=0)
+    return sentence_embeddings
+sentence_embeddings = process_in_batches(sentences, batch_size)
+sentence_embeddings = sentence_embeddings.cpu().numpy()
+# Create a FAISS index
+dim = sentence_embeddings.shape[1]
+faiss_index = faiss.IndexFlatIP(dim)  # Inner product for cosine similarity
+faiss_index.add(sentence_embeddings)
+faiss_index_file = 'faiss_index.bin'
+faiss.write_index(faiss_index, faiss_index_file)
+print(f"FAISS index saved to {faiss_index_file}")
+embeddings_file = 'document_embeddings.npy'
+np.save(embeddings_file, sentence_embeddings)
+print(f"Document embeddings saved to {embeddings_file}")
+end_time = time.time()
+execution_time_hours = (end_time - start_time) / 3600
+print(f"Total execution time: {execution_time_hours:.2f} hours")
+# instruction = "Represent this sentence for searching relevant passages: "
+# queries = ["Who is the president of U.S.A.?"]
+# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
+# # Compute token embeddings
+# with torch.no_grad():
+#     model_output = model(**encoded_input)
+#     # Perform pooling. In this case, cls pooling.
+#     sentence_embeddings = model_output[0][:, 0]
+# # normalize embeddings
+# query_vector = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
+# k = 5  # Number of nearest neighbors to retrieve
+# distances, indices = faiss_index.search(np.array([query_vector.cpu()], dtype=np.float32), k)
+# # Print the most similar documents
+# for i, index in enumerate(indices[0]):
+#     distance = distances[0][i]
+#     print(f"Nearest neighbor {i+1}: {documents[index]}, Distance {distance}")

contriever/faiss_index.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bae4a59032acb07ebc57f7df3cad62932027fed3692454ba6e62b09a75d8e09b
+size 15919002669