Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """ | |
| Split a large file into shards while respecting document boundaries. Documents | |
| should be separated by a single empty line. | |
| """ | |
| import argparse | |
| import contextlib | |
| def main(): | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("input") | |
| parser.add_argument("--num-shards", type=int) | |
| args = parser.parse_args() | |
| assert args.num_shards is not None and args.num_shards > 1 | |
| with open(args.input, "r", encoding="utf-8") as h: | |
| with contextlib.ExitStack() as stack: | |
| outputs = [ | |
| stack.enter_context( | |
| open(args.input + ".shard" + str(i), "w", encoding="utf-8") | |
| ) | |
| for i in range(args.num_shards) | |
| ] | |
| doc = [] | |
| first_doc = [True] * args.num_shards | |
| def output_doc(i): | |
| if not first_doc[i]: | |
| outputs[i].write("\n") | |
| first_doc[i] = False | |
| for line in doc: | |
| outputs[i].write(line) | |
| doc.clear() | |
| num_docs = 0 | |
| for line in h: | |
| if line.strip() == "": # empty line indicates new document | |
| output_doc(num_docs % args.num_shards) | |
| num_docs += 1 | |
| else: | |
| doc.append(line) | |
| output_doc(num_docs % args.num_shards) | |
| if __name__ == "__main__": | |
| main() | |