Spaces:
Runtime error
Runtime error
| import argparse | |
| import os | |
| import ssl | |
| from urllib import request | |
| from lxml import etree | |
| from tqdm import tqdm | |
| from fake_useragent import UserAgent | |
| ua = UserAgent() | |
| def process(args): | |
| # instanciate parser | |
| tree = etree.parse(args.html, parser=etree.HTMLParser(recover=True)) | |
| # get all the links | |
| blast_items = tree.xpath('//*[@id="root"]/div/div/div/main/div[2]/div[2]/section/div/div/span[6]/a/text()') | |
| context = ssl._create_unverified_context() | |
| base_url = "https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/result/" | |
| bar = tqdm(blast_items) | |
| names = [] | |
| for item in bar: | |
| bar.set_postfix({"current": item}) | |
| trg_url = base_url + item + "/accs" | |
| req = request.Request(trg_url, | |
| headers={ | |
| "Accept": "application/json", | |
| 'user-agent': ua.random | |
| }) | |
| res = request.urlopen(req, context=context) | |
| payload = [p[5:] for p in res.read().decode().split("\n")[:-1]] | |
| names.extend(payload) | |
| # remove duplicate | |
| names = list(set(names)) | |
| lenth = len(names) | |
| max_i = lenth//args.chunk_size+1 | |
| for i in range(max_i): | |
| names_ = names[i*args.chunk_size: (i+1)*args.chunk_size] | |
| with open(os.path.join(args.output, f"af_raw_{args.protein_name}_{i}.txt"), "w") as f: | |
| for name in names_: | |
| f.write(name+"\n") | |
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--protein_name", type=str, default="CM", required=False) | |
| parser.add_argument("--html", type=str, default="data/CM/CM.html", required=False) | |
| parser.add_argument("--output", type=str, default="data/CM", required=False) | |
| parser.add_argument("--chunk_size", type=int, default=5000, required=False) | |
| args = parser.parse_args() | |
| process(args) |