Spaces:
Build error
Build error
| #!/usr/bin/python3 | |
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import argparse | |
| import fileinput | |
| import hashlib | |
| import sys | |
| from multiprocessing import Pool | |
def get_hashes_and_lines(raw_line):
    """Pair a raw input line with the hex MD5 digest of its bytes.

    Args:
        raw_line: The line as a bytes-like object; it is passed straight
            to ``hashlib.md5``.

    Returns:
        A ``(hex_digest, raw_line)`` tuple. The line is returned unchanged
        so the caller can emit it without re-reading the input.
    """
    md5_state = hashlib.md5(raw_line)
    return md5_state.hexdigest(), raw_line
def main():
    """De-duplicate input lines, writing each distinct line to stdout once.

    Command-line arguments:
        --workers: Number of worker processes used for hashing (default 10).
        files: Zero or more input paths, handed to ``fileinput.input``.

    Lines are read in binary mode and hashed in a process pool. A line is
    written to ``sys.stdout.buffer`` the first time its digest is seen.
    Results are consumed with ``imap_unordered``, so output order is the
    order in which hashed results arrive, not necessarily input order.
    Progress goes to stderr: the running count every 1,000,000 results and
    a ``.`` every 100,000 results.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=10)
    parser.add_argument("files", nargs="*", help="input files")
    args = parser.parse_args()

    seen = set()
    # Manage the pool with a context manager so the worker processes are
    # always torn down, including when the loop below raises. The pool is
    # entered after (and therefore exited before) the input stream, so no
    # worker feeder is still reading when the input is closed.
    with fileinput.input(args.files, mode="rb") as h, Pool(args.workers) as pool:
        # A chunk size of 1000 amortizes inter-process overhead across many lines.
        results = pool.imap_unordered(get_hashes_and_lines, h, 1000)
        for i, (digest, raw_line) in enumerate(results):
            if digest not in seen:
                seen.add(digest)
                sys.stdout.buffer.write(raw_line)
            if i % 1000000 == 0:
                print(i, file=sys.stderr, end="", flush=True)
            elif i % 100000 == 0:
                print(".", file=sys.stderr, end="", flush=True)
    # Terminate the progress line.
    print(file=sys.stderr, flush=True)
# Run only when executed as a script. The guard also keeps main() from
# re-running if multiprocessing re-imports this module in a worker process.
if __name__ == "__main__":
    main()