#!/usr/bin/env python3
"""
Will break apart large files into smaller files
"""

import argparse
import glob
import os
import re
from typing import BinaryIO, Optional, Sequence, Tuple

# Accepts a plain byte count ("512"), an explicit byte suffix ("512B") or a
# binary multiple with an optional trailing "B" ("5K", "10MB", "2g").
_SIZE_PATTERN = re.compile(r"(\d+)(?:([kmgt])b?|b)?", re.IGNORECASE)

# Power of 1024 applied for each unit letter.
_UNIT_EXPONENTS = {"k": 1, "m": 2, "g": 3, "t": 4}


def _parse_size(text: str) -> int:
    """Convert a human-readable size such as ``10K`` into a number of bytes.

    Args:
        text: Digits optionally followed by B, K, M, G or T (any case).
            K/M/G/T are powers of 1024 and may carry a trailing "B".

    Returns:
        The size in bytes.

    Raises:
        ValueError: If *text* is not a recognised size.
    """
    match = _SIZE_PATTERN.fullmatch(text.strip())
    if match is None:
        raise ValueError(
            f"invalid size {text!r}; expected something like 512, 5K, 10M or 2G"
        )
    number, unit = match.groups()
    exponent = _UNIT_EXPONENTS[unit.lower()] if unit else 0
    return int(number) * 1024 ** exponent


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace. ``size`` keeps the text the user typed and
        ``size_bytes`` holds the same limit converted to bytes.
    """
    argp = argparse.ArgumentParser(
        description="Will break a large file into smaller files"
    )
    argp.add_argument(
        'large_file',
        help="Full path for large file"
    )
    argp.add_argument(
        '--prefix', '-p',
        default='small',
        help="Prefix for the smaller files (default: %(default)s)"
    )
    argp.add_argument(
        '--extension', '-x',
        default='txt',
        help="Extension for the smaller files (default: %(default)s)"
    )
    argp.add_argument(
        '--size', '-s',
        default='10K',
        help="File size limits (5K, 10M, 2G)"
    )
    argp.add_argument(
        '--old', '-o',
        action='store_true',
        help="Reuse existing small files; default behavior is to remove existing files for new ones"
    )
    # TODO: add a --count/-c option to request a number of files to generate
    # instead of a size limit.

    args = argp.parse_args(argv)
    try:
        args.size_bytes = _parse_size(args.size)
    except ValueError as err:
        # Report a bad size as a normal usage error instead of a traceback.
        argp.error(str(err))
    return args


def _remove_previous_chunks(prefix: str, extension: str, keep: str) -> None:
    """Delete chunk files left by an earlier run, never touching *keep*."""
    # Escape the user-supplied parts so characters such as "[" or "*" in the
    # prefix are matched literally rather than as glob syntax.
    pattern = f"{glob.escape(prefix)}-*.{glob.escape(extension)}"
    keep_path = os.path.abspath(keep)
    for previous_file in glob.glob(pattern):
        if os.path.abspath(previous_file) == keep_path:
            # The input itself matches the chunk pattern; do not delete it.
            continue
        os.remove(previous_file)


def _open_chunk(prefix: str, extension: str, index: int) -> Tuple[BinaryIO, int]:
    """Open chunk number *index* for appending.

    Returns:
        The open binary file and the number of bytes it already contains
        (non-zero only when an existing chunk is being reused).
    """
    path = f"{prefix}-{index:05d}.{extension}"
    existing_size = os.path.getsize(path) if os.path.isfile(path) else 0
    return open(path, 'ab'), existing_size


def _write_chunks(large_path: str, prefix: str, extension: str,
                  size_limit: int) -> int:
    """Copy *large_path* line by line into numbered chunk files.

    Data is handled as bytes so the chunks concatenate back to the exact
    original content. A chunk is closed before a line that would push it past
    *size_limit*; a single line longer than the limit is written to a chunk of
    its own, which is the only case where a chunk exceeds the limit.

    Returns:
        The index of the last chunk file used (0 when the input is empty).
    """
    chunk_index = 0
    chunk = None
    chunk_size = 0
    try:
        with open(large_path, 'rb') as large_file:
            for line in large_file:
                while True:
                    if chunk is None:
                        chunk_index += 1
                        chunk, chunk_size = _open_chunk(
                            prefix, extension, chunk_index
                        )
                    if chunk_size == 0 or chunk_size + len(line) <= size_limit:
                        break
                    # Current chunk cannot take this line; move to the next.
                    chunk.close()
                    chunk = None
                chunk.write(line)
                chunk_size += len(line)
    finally:
        if chunk is not None:
            chunk.close()
    return chunk_index


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Split the requested large file into size-limited chunk files.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = parse_args(argv)

    if not args.old:
        print("Removing previous files...")
        _remove_previous_chunks(args.prefix, args.extension, args.large_file)

    print(f"Begin chunking large file {args.large_file}...")
    file_count = _write_chunks(
        args.large_file, args.prefix, args.extension, args.size_bytes
    )

    print(f"Large file {args.large_file} chunked into {file_count} files no bigger than {args.size}")


if __name__ == "__main__":
    main()