import nltk
import pickle
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# nltk.word_tokenize requires the 'punkt' tokenizer data; run
# nltk.download('punkt') once in the environment before first use.
class TrigramBlock:
    """Tracks word trigrams across sentences to filter out redundant ones."""

    def __init__(self):
        self.trigrams = set()

    def check_overlap(self, text):
        # True if `text` shares any trigram with previously seen sentences;
        # its trigrams are recorded either way.
        tokens = self._preprocess(text)
        trigrams = set(self._get_trigrams(tokens))
        overlap = bool(self.trigrams & trigrams)
        self.trigrams |= trigrams
        return overlap

    def _preprocess(self, text):
        # Lowercase and keep only letters and whitespace before tokenizing.
        text = text.lower()
        text = ''.join([c for c in text if c.isalpha() or c.isspace()])
        return nltk.word_tokenize(text)

    def _get_trigrams(self, tokens):
        return [' '.join(tokens[i:i + 3]) for i in range(len(tokens) - 2)]
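# A minimal usage sketch of TrigramBlock (hypothetical sentences; assumes the
# nltk 'punkt' data noted above is available). check_overlap returns True when
# a sentence shares any word trigram with the sentences seen so far:
#
#   block = TrigramBlock()
#   block.check_overlap('we propose a novel method')    # False: first sentence
#   block.check_overlap('we propose a novel approach')  # True: 'we propose a' repeats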
def convert_sentence_df(sentJson, pred, true_proba, set_trigram_blocking):
    body = pd.DataFrame([(section, sent['text'].strip())
                         for section in 'IMRD'
                         for sent in sentJson['body'][section]],
                        columns=['section', 'text']).astype({'section': 'category', 'text': 'string'})
    # Attach the prediction results and probabilities
    body['predict'] = pred.astype('bool')
    body['proba'] = true_proba.astype('float16')
    # Apply trigram blocking to the extracted sentences of each section
    if set_trigram_blocking:
        for section in 'IMRD':
            block = TrigramBlock()
            # Walk the selected sentences from most to least confident and
            # deselect any that repeat a trigram already kept.
            temp = body.loc[(body['section'] == section) & body['predict']].sort_values(by='proba', ascending=False)
            for i, row in temp.iterrows():
                if block.check_overlap(row['text']):
                    body.at[i, 'predict'] = False
    return body
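# Input shape assumed by convert_sentence_df, inferred from the access pattern
# above rather than from a documented schema: sentJson['body'] maps each IMRD
# letter to a list of {'text': ...} dicts, and pred/true_proba are aligned
# with the sentences flattened in I, M, R, D order, e.g.
#
#   sentJson = {'body': {'I': [{'text': 'Intro sentence.'}],
#                        'M': [{'text': 'Methods sentence.'}],
#                        'R': [{'text': 'Results sentence.'}],
#                        'D': [{'text': 'Discussion sentence.'}]}}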
# Extractive method
def extractive_method(sentJson, sentFeat, model, threshold=0.5, TGB=False):
    # Predict
    def predict(x):
        true_proba = model.predict_proba(x)[:, 1]
        # If no sentence's probability exceeds the threshold, fall back to
        # taking the highest-probability sentence as the summary sentence
        if not np.any(true_proba > threshold):
            true_proba[true_proba == np.max(true_proba)] = 1
        pred = (true_proba > threshold).astype('int')
        return pred, true_proba

    # Predict section by section, iterating in IMRD order so the concatenated
    # arrays stay aligned with the row order built by convert_sentence_df.
    grouped = sentFeat.groupby('section')
    pred = np.array([])
    true_proba = np.array([])
    for section in 'IMRD':
        pred_sec, true_proba_sec = predict(grouped.get_group(section))
        pred = np.append(pred, pred_sec)
        true_proba = np.append(true_proba, true_proba_sec)

    body = convert_sentence_df(sentJson, pred, true_proba, TGB)
    res = body[body['predict']]
    ext = {i: ' '.join(res.groupby('section').get_group(i)['text']) for i in 'IMRD'}
    return ext
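# Usage sketch (hypothetical file name; sentFeat is assumed to be a feature
# DataFrame with a 'section' column covering all of I, M, R and D, and the
# pickled model to expose predict_proba like a scikit-learn classifier):
#
#   extModel = load_ExtModel('extractive_model.pkl')
#   ext = extractive_method(sentJson, sentFeat, extModel, threshold=0.5, TGB=True)
#   print(ext['I'])  # extractive summary of the Introduction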
def abstractive_method(ext, tokenizer, model, device='cpu'):
    abstr = {key: '' for key in 'IMRD'}
    for section in 'IMRD':
        text = ext[section]
        # Truncate to the tokenizer's model_max_length (set in load_AbstrModel)
        model_inputs = tokenizer(text, truncation=True, return_tensors='pt').input_ids
        outputs = model.generate(model_inputs.to(device))
        abstr[section] = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return abstr
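# Usage sketch (hypothetical checkpoint directory; `ext` is the dict returned
# by extractive_method above):
#
#   tokenizer, abstrModel = load_AbstrModel('models/abstractive', device='cpu')
#   abstr = abstractive_method(ext, tokenizer, abstrModel, device='cpu')
#   print(abstr['R'])  # abstractive summary of the Results section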
# extractive summarizer
def load_ExtModel(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
# abstractive summarizer
def load_AbstrModel(path, device='cpu'):
    model_checkpoint = path
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=1024)
    abstrModel = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)
    # Beam-search generation defaults, written into the model config so that
    # model.generate() picks them up without extra arguments.
    generation_config = {
        'num_beams': 5,
        'max_length': 512,
        'min_length': 64,
        'length_penalty': 2.0,
        'early_stopping': True,
        'no_repeat_ngram_size': 3
    }
    abstrModel.config.update(generation_config)
    return tokenizer, abstrModel
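# End-to-end sketch tying the pieces together (paths are hypothetical, and the
# device check assumes `import torch`, which transformers already depends on):
#
#   device = 'cuda' if torch.cuda.is_available() else 'cpu'
#   extModel = load_ExtModel('models/extractive.pkl')
#   tokenizer, abstrModel = load_AbstrModel('models/abstractive', device=device)
#   ext = extractive_method(sentJson, sentFeat, extModel, TGB=True)
#   abstr = abstractive_method(ext, tokenizer, abstrModel, device=device)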