Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """ | |
| Count the number of documents and average number of lines and tokens per | |
| document in a large file. Documents should be separated by a single empty line. | |
| """ | |
| import argparse | |
| import gzip | |
| import sys | |
| import numpy as np | |
| def main(): | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("input") | |
| parser.add_argument("--gzip", action="store_true") | |
| args = parser.parse_args() | |
| def gopen(): | |
| if args.gzip: | |
| return gzip.open(args.input, "r") | |
| else: | |
| return open(args.input, "r", encoding="utf-8") | |
| num_lines = [] | |
| num_toks = [] | |
| with gopen() as h: | |
| num_docs = 1 | |
| num_lines_in_doc = 0 | |
| num_toks_in_doc = 0 | |
| for i, line in enumerate(h): | |
| if len(line.strip()) == 0: # empty line indicates new document | |
| num_docs += 1 | |
| num_lines.append(num_lines_in_doc) | |
| num_toks.append(num_toks_in_doc) | |
| num_lines_in_doc = 0 | |
| num_toks_in_doc = 0 | |
| else: | |
| num_lines_in_doc += 1 | |
| num_toks_in_doc += len(line.rstrip().split()) | |
| if i % 1000000 == 0: | |
| print(i, file=sys.stderr, end="", flush=True) | |
| elif i % 100000 == 0: | |
| print(".", file=sys.stderr, end="", flush=True) | |
| print(file=sys.stderr, flush=True) | |
| print("found {} docs".format(num_docs)) | |
| print("average num lines per doc: {}".format(np.mean(num_lines))) | |
| print("average num toks per doc: {}".format(np.mean(num_toks))) | |
| if __name__ == "__main__": | |
| main() | |