Aux script to split input data files into chunks
This commit is contained in:
parent
0ea106abcf
commit
9671205c15
|
@ -0,0 +1,71 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""Splits an input file into random chunks.
|
||||
|
||||
Usage:
|
||||
split_data <input file> <chunks>
|
||||
|
||||
See [root]/data/splits/README for a rationale.
|
||||
"""
|
||||
|
||||
import os
|
||||
import argparse
|
||||
|
||||
|
||||
# Absolute path of the directory that receives the chunk files. It is
# resolved relative to this script's own location (``../data/splits``), not
# relative to the current working directory.
CHUNK_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, "data", "splits")
)
|
||||
|
||||
|
||||
def get_args():
    """Parse the command-line arguments of the script.

    The first line of the module docstring is used as the parser description
    and the remaining lines as the epilog of the help message.

    Returns:
        dict: The parsed arguments, keyed by ``"<input file>"`` (str, path of
        the file to split) and ``"<chunks>"`` (int, number of chunks).
    """
    # The module docstring is None when docstrings are stripped (python -OO);
    # fall back to an empty help text instead of crashing.
    doclines = (__doc__ or "").splitlines()
    description = doclines[0] if doclines else None
    epilog = "\n".join(doclines[1:])

    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        # Keep the line breaks of the docstring: the default formatter would
        # re-wrap the "Usage:" section into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("<input file>", type=str, help="Input file to split.")
    parser.add_argument("<chunks>", type=int, help="Number of chunks to produce.")
    return vars(parser.parse_args())
|
||||
|
||||
|
||||
def chunk_fpath(source_path, chunk_i, chunk_total):
    """Build the output path of one chunk inside ``CHUNK_DIR``.

    The file name has the form
    ``<chunk_total>_<chunk_i>_<base name of source_path>``.

    Args:
        source_path: Path of the file being split; only its base name is used.
        chunk_i: Index of the chunk.
        chunk_total: Total number of chunks.

    Returns:
        str: Path of the chunk file, joined onto ``CHUNK_DIR``.
    """
    _, source_name = os.path.split(source_path)
    return os.path.join(CHUNK_DIR, f"{chunk_total}_{chunk_i}_{source_name}")
|
||||
|
||||
|
||||
def main():
    """Split the input file named on the command line into chunk files.

    Reads the arguments through ``get_args()``, counts the lines of the input
    file and writes ``<chunks>`` output files, named by ``chunk_fpath()``,
    each holding a consecutive run of lines. Every chunk but the last gets
    ``line_count // chunk_count`` lines; the last chunk gets all remaining
    lines, so no input line is dropped.

    Raises:
        SystemExit: If ``<chunks>`` is not positive, or is larger than the
            number of lines in the input file.
    """
    # Local import: only this entry point needs it.
    from itertools import islice

    args = get_args()
    source_path = args["<input file>"]
    chunk_count = args["<chunks>"]

    if chunk_count < 1:
        raise SystemExit(
            f"<chunks> must be a positive integer, got {chunk_count}."
        )

    with open(source_path, "r") as source_file:
        # First pass: count the lines so the chunk size can be computed.
        line_count = sum(1 for _line in source_file)

        if line_count < chunk_count:
            # A chunk length of zero cannot produce meaningful chunks.
            raise SystemExit(
                f"Cannot split {line_count} line(s) into {chunk_count} chunks."
            )

        chunk_len = line_count // chunk_count

        # Second pass: rewind and copy the lines out chunk by chunk.
        source_file.seek(0)

        os.makedirs(CHUNK_DIR, exist_ok=True)

        for chunk_i in range(chunk_count):
            outfile_name = chunk_fpath(source_path, chunk_i, chunk_count)
            print(f'Writing {outfile_name}...')
            with open(outfile_name, "w") as out_file:
                if chunk_i < chunk_count - 1:
                    out_file.writelines(islice(source_file, chunk_len))
                else:
                    # The last chunk always exists and also absorbs the
                    # remainder of the integer division.
                    out_file.writelines(source_file)
|
||||
|
||||
|
||||
# Run the splitter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,2 @@
|
|||
splits/
|
||||
!splits/.gitinclude
|
Loading…
Reference in New Issue