Aux script to split input data files into chunks

This commit is contained in:
Miguel M 2023-06-06 16:04:30 +01:00
parent 0ea106abcf
commit 9671205c15
2 changed files with 73 additions and 0 deletions

71
aux/split_data.py Normal file
View File

@ -0,0 +1,71 @@
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Splits an input file into random chunks.
Usage:
split_data <input file> <chunks>
See [root]/data/splits/README for a rationale.
"""
import os
import argparse
# Absolute path of the directory that receives the chunk files. It is
# resolved relative to this script's own location (one level up, then
# data/splits), not relative to the current working directory.
CHUNK_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, "data", "splits")
)
def get_args():
    """Parse the command-line arguments of the script.

    The first line of the module docstring is used as the program
    description and the remaining lines as the help epilog.

    Returns:
        dict: the parsed values keyed by argument name, i.e.
        ``"<input file>"`` (str) and ``"<chunks>"`` (int).
    """
    # __doc__ is None when the interpreter strips docstrings (python -OO);
    # fall back to an empty help text instead of crashing on None.
    doclines = (__doc__ or "").splitlines()
    description = doclines[0] if doclines else None
    epilog = "\n".join(doclines[1:]) or None
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        # The epilog contains a multi-line usage example; the default
        # formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("<input file>", type=str, help="Input file to split.")
    parser.add_argument("<chunks>", type=int, help="Number of chunks to produce.")
    return vars(parser.parse_args())
def chunk_fpath(source_path, chunk_i, chunk_total):
    """Build the output path of one chunk of ``source_path``.

    The chunk file lives in ``CHUNK_DIR`` and is named
    ``<chunk_total>_<chunk_i>_<basename of source_path>``.
    """
    source_name = os.path.basename(source_path)
    chunk_name = "{}_{}_{}".format(chunk_total, chunk_i, source_name)
    return os.path.join(CHUNK_DIR, chunk_name)
def main():
    """Split the input file named on the command line into chunk files.

    The input is read twice: once to count its lines and once to copy
    them out in their original order. Each of the first ``<chunks> - 1``
    output files receives ``line_count // <chunks>`` consecutive lines;
    the final file receives every remaining line, so it also absorbs the
    remainder when the line count is not evenly divisible. Output paths
    are produced by ``chunk_fpath``.

    Raises:
        SystemExit: if ``<chunks>`` is less than 1 or larger than the
            number of lines in the input file.
    """
    from itertools import islice  # local import: only main() needs it

    args = get_args()
    source_path = args["<input file>"]
    chunk_count = args["<chunks>"]
    if chunk_count < 1:
        raise SystemExit(f"<chunks> must be at least 1, got {chunk_count}")

    with open(source_path, "r") as source_file:
        line_count = sum(1 for _line in source_file)
        if line_count < chunk_count:
            # A chunk length of zero previously ended in ZeroDivisionError.
            raise SystemExit(
                f"Cannot split {line_count} line(s) into {chunk_count} chunks"
            )
        chunk_len = line_count // chunk_count
        source_file.seek(0)

        os.makedirs(CHUNK_DIR, exist_ok=True)
        for chunk_i in range(chunk_count):
            outfile_name = chunk_fpath(source_path, chunk_i, chunk_count)
            print(f'Writing {outfile_name}...')
            # The last chunk is always written and takes whatever is left.
            # It used to be skipped when the line count divided evenly,
            # which silently dropped its lines.
            if chunk_i == chunk_count - 1:
                chunk_lines = source_file
            else:
                chunk_lines = islice(source_file, chunk_len)
            with open(outfile_name, "w") as out_file:
                out_file.writelines(chunk_lines)
# Run the splitter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

2
data/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
splits/
!splits/.gitinclude