Spaces:
Build error
Build error
| import argparse | |
| import gzip | |
| import json | |
| import os | |
| from glob import glob | |
| from tqdm import tqdm | |
| tqdm.pandas() | |
| # Load trajectories for resolved instances | |
def load_completions(output_dir: str, instance_id: str):
    """Load the final logged LLM completion for one instance.

    Searches ``<output_dir>/llm_completions/<instance_id>/*.json`` and reads
    the file whose path sorts last, which the script treats as the last turn.

    Args:
        output_dir: Directory containing the ``llm_completions`` folder.
        instance_id: Name of the per-instance sub-directory to read.

    Returns:
        ``None`` when no completion file exists for the instance. Otherwise a
        dict with two keys: ``'messages'`` (the logged request messages
        followed by the message of the first response choice) and ``'tools'``
        (the ``tools`` entry of the logged ``kwargs``, or ``[]`` if absent).

    Raises:
        KeyError: If ``messages``, ``response``/``choices``/``message`` or
            ``kwargs`` is missing from the file.
        IndexError: If the response has no choices.
    """
    pattern = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
    files = sorted(glob(pattern))  # ascending, so the last entry sorts highest
    if not files:
        return None

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(files[-1], 'r', encoding='utf-8') as f:
        result = json.load(f)

    # The full conversation is the request messages plus the model's reply.
    messages = result['messages']
    messages.append(result['response']['choices'][0]['message'])
    tools = result['kwargs'].get('tools', [])
    return {
        'messages': messages,
        'tools': tools,
    }
# Command-line entry: takes the path of an output JSONL file and writes a
# gzip-compressed copy in which every record carries a 'raw_completions'
# field loaded from the llm_completions folder next to the input file.
parser = argparse.ArgumentParser()
parser.add_argument('jsonl_path', type=str)
args = parser.parse_args()

output_dir = os.path.dirname(args.jsonl_path)
output_path = os.path.join(output_dir, 'output.with_completions.jsonl.gz')

# Check if output would be different from input: stop at the first record
# whose stored 'raw_completions' differs from what is on disk.
needs_update = False
with open(args.jsonl_path, 'r', encoding='utf-8') as f_in:
    for line in tqdm(f_in, desc='Checking for changes'):
        if not line.strip():
            # Tolerate blank lines instead of crashing in json.loads.
            continue
        data = json.loads(line)
        new_completions = load_completions(output_dir, data['instance_id'])
        current_completions = data.get('raw_completions')
        if current_completions != new_completions:
            needs_update = True
            break

if not needs_update:
    print('No updates required. Skipping file update.')
    # SystemExit rather than exit(): the latter is injected by the `site`
    # module for interactive use and is not guaranteed to be defined.
    raise SystemExit(0)

if os.path.exists(output_path):
    print(f'Output file already exists at {output_path}, overwriting? (y/n)')
    if input() != 'y':
        print('Exiting...')
        raise SystemExit(0)

# Process line by line so the whole input never has to be held in memory.
with open(args.jsonl_path, 'r', encoding='utf-8') as f_in, gzip.open(
    output_path, 'wt', encoding='utf-8'
) as f_out:
    for line in tqdm(f_in):
        if not line.strip():
            continue
        data = json.loads(line)
        data['raw_completions'] = load_completions(output_dir, data['instance_id'])
        f_out.write(json.dumps(data) + '\n')

print(f'Saved compressed output to {output_path}')