probejie committed (verified)
Commit 7164129 · 1 parent: 02f5e41

Upload folder using huggingface_hub

contriever/document_embeddings.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:479fc547765b1eef5e80ba8b603485eb9e6d430c472ed69c9568bc42f318df4c
+ size 15919002752
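
This is a Git LFS pointer, not the array itself; the ~15.9 GB .npy payload lives in LFS storage. Assuming 768-dimensional float32 Contriever embeddings and the standard 128-byte .npy header, the stated size corresponds to exactly 5,181,967 vectors (5,181,967 × 768 × 4 + 128 = 15,919,002,752 bytes). A minimal sketch for loading the file without pulling it all into RAM follows; the repo id is a placeholder, not something recorded in this commit.

import numpy as np
from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the repository this commit belongs to.
path = hf_hub_download(repo_id="<user>/<repo>",
                       filename="contriever/document_embeddings.npy")

# Memory-map the ~15.9 GB array instead of reading it eagerly.
embeddings = np.load(path, mmap_mode="r")
print(embeddings.shape, embeddings.dtype)  # expected: (5181967, 768) float32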
contriever/encode.py ADDED
@@ -0,0 +1,139 @@
+ import faiss
+ import numpy as np
+ import json
+ from tqdm import tqdm
+ import os
+ from torch.nn import DataParallel
+ from transformers import AutoTokenizer, AutoModel, T5EncoderModel
+ import torch
+ from sentence_transformers import SentenceTransformer
+ from multiprocessing import Pool
+ import time
+
+ start_time = time.time()
+
+ with open("merged_triple_processed_new_withID.json", "r") as fi:
+     data = json.load(fi)
+
+ sentences = [d['contents'] for d in data]
+ print("Number of chunks:", len(sentences))
+
+ # model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
+ # model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
+ model_path = 'facebook/contriever'
+
+ ### Using SentenceTransformer (multi-GPU via one process per device)
+ # def encode_sentences_on_gpu(params):
+ #     sentences_chunk, device_id = params
+ #     device = torch.device(f'cuda:{device_id}')
+ #     model = SentenceTransformer(model_path, device=device)
+ #     embeddings = model.encode(
+ #         sentences_chunk,
+ #         batch_size=1024,
+ #         show_progress_bar=False,
+ #         convert_to_numpy=True,
+ #         normalize_embeddings=True
+ #     )
+ #     return embeddings
+
+ # num_gpus = torch.cuda.device_count()
+ # print(f"Number of GPUs: {num_gpus}")
+
+ # sentences_chunks = np.array_split(sentences, num_gpus)
+ # params = [(sentences_chunks[i], i) for i in range(num_gpus)]
+
+ # with Pool(processes=num_gpus) as pool:
+ #     embeddings_list = pool.map(encode_sentences_on_gpu, params)
+
+ # sentence_embeddings = np.concatenate(embeddings_list, axis=0)
+
+ ### Using Transformers
+
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model = AutoModel.from_pretrained(model_path)
+ # model = T5EncoderModel.from_pretrained(model_path)  # for GTR
+ model = DataParallel(model)  # wrap the model for multi-GPU support
+ model.eval()
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ model.to(device)
+
+ batch_size = 1024
+
+ def mean_pooling(token_embeddings, mask):
+     # Average token embeddings over non-padding positions.
+     token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
+     sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
+     return sentence_embeddings
+
+ def process_in_batches(sentences, batch_size):
+     sentence_embeddings_list = []
+     for i in tqdm(range(0, len(sentences), batch_size)):
+         batch_sentences = sentences[i:i + batch_size]
+         encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt').to(device)
+
+         with torch.no_grad():
+             model_output = model(**encoded_input)
+             batch_sentence_embeddings = mean_pooling(model_output[0], encoded_input['attention_mask'])
+
+             # CLS pooling for BGE
+             # batch_sentence_embeddings = model_output[0][:, 0]
+             # Mean pooling over the last hidden state for GTR
+             # batch_sentence_embeddings = model_output.last_hidden_state.mean(dim=1)
+
+             # batch_sentence_embeddings = torch.nn.functional.normalize(batch_sentence_embeddings, p=2, dim=1)
+         sentence_embeddings_list.append(batch_sentence_embeddings.cpu())  # move to CPU to save GPU memory
+
+     sentence_embeddings = torch.cat(sentence_embeddings_list, dim=0)
+     return sentence_embeddings
+
+ sentence_embeddings = process_in_batches(sentences, batch_size)
+ sentence_embeddings = sentence_embeddings.numpy()  # already on CPU
+
+ # Create a FAISS index
+ dim = sentence_embeddings.shape[1]
+ faiss_index = faiss.IndexFlatIP(dim)  # inner product (equals cosine similarity only for normalized embeddings)
+ faiss_index.add(sentence_embeddings)
+
+ faiss_index_file = 'faiss_index.bin'
+ faiss.write_index(faiss_index, faiss_index_file)
+ print(f"FAISS index saved to {faiss_index_file}")
+
+ embeddings_file = 'document_embeddings.npy'
+ np.save(embeddings_file, sentence_embeddings)
+ print(f"Document embeddings saved to {embeddings_file}")
+
+ end_time = time.time()
+ execution_time_hours = (end_time - start_time) / 3600
+ print(f"Total execution time: {execution_time_hours:.2f} hours")
+
+ # Example query (instruction prefix and CLS pooling are BGE conventions):
+ # instruction = "Represent this sentence for searching relevant passages: "
+ # queries = ["Who is the president of the U.S.A.?"]
+
+ # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
+
+ # # Compute token embeddings
+ # with torch.no_grad():
+ #     model_output = model(**encoded_input)
+ # # Perform pooling; in this case, CLS pooling.
+ # sentence_embeddings = model_output[0][:, 0]
+ # # Normalize embeddings
+ # query_vector = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
+
+ # k = 5  # number of nearest neighbors to retrieve
+ # distances, indices = faiss_index.search(query_vector.cpu().numpy(), k)  # search expects a 2-D float32 array
+
+ # # Print the most similar documents
+ # for i, index in enumerate(indices[0]):
+ #     distance = distances[0][i]
+ #     print(f"Nearest neighbor {i+1}: {sentences[index]}, Distance {distance}")
contriever/faiss_index.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bae4a59032acb07ebc57f7df3cad62932027fed3692454ba6e62b09a75d8e09b
+ size 15919002669
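
The stated size is consistent with a flat index over the same corpus: 5,181,967 × 768 × 4 bytes of raw float32 vectors plus what appears to be a 45-byte FAISS serialization header gives exactly 15,919,002,669 bytes. A sketch for loading and sanity-checking it, with the same placeholder repo id as above:

import faiss
from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the repository this commit belongs to.
path = hf_hub_download(repo_id="<user>/<repo>",
                       filename="contriever/faiss_index.bin")

index = faiss.read_index(path)
print(index.ntotal, index.d)  # expected: 5181967 768

# Spot-check: the first stored vector should equal row 0 of
# document_embeddings.npy.
first_vector = index.reconstruct(0)
print(first_vector.shape)  # (768,)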