#!/usr/bin/env python3

"""
Will break apart large files into smaller files
"""

import argparse
import glob
import os
import re

# Multiplier exponents for size suffixes: bytes = N * 1024 ** exponent.
# '' and 'b' both mean plain bytes.
_SIZE_FACTORS = {'': 0, 'b': 0, 'k': 1, 'm': 2, 'g': 3, 't': 4}


def parse_args():
    """Parse and return the command-line arguments."""
    argp = argparse.ArgumentParser(
        description="Will break a large file into smaller files"
    )

    argp.add_argument(
        'large_file',
        help="Full path for large file"
    )

    argp.add_argument(
        '--prefix', '-p',
        default='small',
        help="Prefix for the smaller files (default: %(default)s)"
    )

    argp.add_argument(
        '--extension', '-x',
        default='txt',
        help="Extension for the smaller files (default: %(default)s)"
    )

    argp.add_argument(
        '--size', '-s',
        default='10K',
        help="File size limits (5K, 10M, 2G)"
    )

    argp.add_argument(
        '--old', '-o',
        action='store_true',
        help="Reuse existing small files; default behavior is to remove existing files for new ones"
    )

    # Future functionality:
    # argp.add_argument(
    #     '--count', '-c',
    #     type=int,
    #     default=1,
    #     help="Number of files to generate (default: %(default)s)"
    # )

    return argp.parse_args()


def _parse_size(size_spec):
    """Convert a size spec such as '512', '5K', '10M' or '2G' to bytes.

    Raises:
        ValueError: if *size_spec* is not an integer optionally followed
            by one of B/K/M/G/T (case-insensitive).
    """
    match = re.fullmatch(r'(\d+)([BbKkMmGgTt]?)', size_spec)
    if not match:
        raise ValueError(f"Invalid size specification: {size_spec}")
    value, suffix = match.groups()
    return int(value) * 1024 ** _SIZE_FACTORS[suffix.lower()]


def _remove_previous_files(prefix, extension):
    """Delete leftover chunk files matching ``<prefix>-*.<extension>``."""
    print("Removing previous files...")
    for previous_file in glob.glob(f"{prefix}-*.{extension}"):
        os.remove(previous_file)


def main():
    """Split the large file into numbered chunk files limited by --size.

    Lines are never split across chunks, so a chunk may exceed the limit
    by at most one line. Unless --old is given, pre-existing chunk files
    are removed first.
    """
    args = parse_args()
    size_limit = _parse_size(args.size)

    if not args.old:
        _remove_previous_files(args.prefix, args.extension)

    print(f"Begin chunking large file {args.large_file}...")

    file_count = 0
    small_file = None
    try:
        with open(args.large_file) as large_file:
            for line in large_file:
                if small_file is None:
                    # Start the next chunk; append when --old left a file behind.
                    file_count += 1
                    output_filename = f"{args.prefix}-{file_count:05}.{args.extension}"
                    file_mode = 'a' if os.path.isfile(output_filename) else 'w'
                    small_file = open(output_filename, file_mode)

                small_file.write(line)

                # tell() right after the write is the chunk's current size in
                # bytes, so we never re-open the file just to measure it.
                if small_file.tell() > size_limit:
                    small_file.close()
                    small_file = None
    finally:
        # Close the trailing (partially filled) chunk, if any.
        if small_file is not None:
            small_file.close()

    print(f"Large file {args.large_file} chunked into {file_count} files no bigger than {args.size}")


if __name__ == "__main__":
    main()