# Spaces: Running on Zero
'''
    This file prepares the dataset index as a JSONL file: one {"file_path": ...} record per line.
'''
import contextlib
import json
import os
import shutil
import sys

# Import files from the local folder
root_path = os.path.abspath('.')
sys.path.append(root_path)
from curation_pipeline.prepare_bridge_v1 import read_bridge_v1
from curation_pipeline.prepare_bridge_v2 import read_bridge_v2
def write_jsonl(file_paths, store_name):
    """Write one ``{"file_path": <entry>}`` JSON object per line to *store_name*.

    Args:
        file_paths: Iterable of JSON-serializable entries; each becomes the
            value of the ``"file_path"`` key of one record.
        store_name: Path of the JSONL file to create (truncated if it exists).

    Returns:
        The number of records written.
    """
    written = 0
    # json.dump escapes non-ASCII by default, so the explicit encoding only
    # pins the on-disk format independently of the platform locale.
    with open(store_name, 'w', encoding="utf-8") as outfile:
        for written, file_path in enumerate(file_paths, start=1):
            json.dump({"file_path": file_path}, outfile)
            outfile.write('\n')
    return written


def main(
    v1_dataset_path="/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v1/berkeley",
    v2_dataset_path="/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v2",
    store_name="store.jsonl",
):
    """Collect the Bridge v1 and v2 entries and store them as a JSONL file.

    Args:
        v1_dataset_path: Root folder handed to ``read_bridge_v1``.
        v2_dataset_path: Root folder handed to ``read_bridge_v2``.
        store_name: Output JSONL path; any previous file is replaced.
    """
    # Remove a stale output up front so that a failure while reading the
    # datasets does not leave an outdated file behind. EAFP avoids the
    # exists()/remove() race of a check-then-act sequence.
    with contextlib.suppress(FileNotFoundError):
        os.remove(store_name)

    # Execute
    # NOTE(review): the empty destination and copyfile=False presumably make
    # the readers only list entries without copying anything -- confirm
    # against curation_pipeline.
    full_lists = []
    full_lists.extend(read_bridge_v1(v1_dataset_path, "", copyfile=False))
    full_lists.extend(read_bridge_v2(v2_dataset_path, "", copyfile=False))
    print("Full length is ", len(full_lists))

    write_jsonl(full_lists, store_name)


if __name__ == "__main__":
    main()