"""Source code for NuRadioMC.utilities.split_hdf5"""
import os
import argparse
import logging
import math
from six import iteritems
import numpy as np
import h5py
logger = logging.getLogger("HDF5-split")
logging.basicConfig(level=logging.DEBUG)
logger.setLevel(logging.WARNING)
def split_hdf5_input_file(input_filename, output_filename, number_of_events_per_file):
    """
    splits up an existing hdf5 file into multiple subfiles

    Parameters
    ----------
    input_filename: string
        the input filename
    output_filename: string
        the desired output filename (if multiple files are generated, a
        '.part000x' suffix is appended to the filename)
    number_of_events_per_file: int
        the number of events per output file (the last file may contain fewer)
    """
    attributes = {}
    data = {}
    groups = {}
    group_attrs = {}
    # Read the full content of the input file into memory; the context manager
    # guarantees the file is closed even if reading raises.
    with h5py.File(input_filename, 'r') as fin:
        for key in fin:
            # use the public h5py.Group instead of the private h5py._hl.group.Group
            if isinstance(fin[key], h5py.Group):
                groups[key] = {}
                for key2 in fin[key]:
                    groups[key][key2] = fin[key][key2][...]
                if key not in group_attrs:
                    group_attrs[key] = {}
                    for key2 in fin[key].attrs:
                        group_attrs[key][key2] = fin[key].attrs[key2]
                else:
                    # keys are unique within one hdf5 file, so this branch only
                    # matters if the same group is visited twice (leftover from
                    # the multi-file merge logic this code derives from)
                    for key2 in fin[key].attrs:
                        if not np.all(group_attrs[key][key2] == fin[key].attrs[key2]):
                            logger.warning(f"attribute {key2} of group {key} of file {input_filename} are different ({group_attrs[key][key2]} vs. {fin[key].attrs[key2]}. Using attribute value of first file, but you have been warned!")
            else:
                # top-level datasets hold per-event data
                data[key] = fin[key][...]
        # file-level attributes are copied to every output file
        for key, value in fin.attrs.items():
            attributes[key] = value

    n_events = len(data['event_ids'])
    logger.info("saving {} events in total".format(n_events))
    total_number_of_events = attributes['n_events']
    n_files = math.ceil(n_events / number_of_events_per_file)
    for i_file in range(n_files):
        filename2 = output_filename + ".part{:04}".format(i_file)
        logger.debug(f"saving file {i_file} with {number_of_events_per_file} events to {filename2}")
        # slice bounds for this part (hoisted; the original recomputed them inline)
        i1 = i_file * number_of_events_per_file
        i2 = (i_file + 1) * number_of_events_per_file
        with h5py.File(filename2, 'w') as fout:
            for key, value in attributes.items():
                fout.attrs[key] = value
            # distribute the simulated-event count evenly over the subfiles
            fout.attrs['total_number_of_events'] = total_number_of_events / n_files
            # NOTE(review): the last file typically contains fewer than
            # number_of_events_per_file events but is labelled with the full
            # count here — confirm whether downstream readers rely on this.
            fout.attrs['n_events'] = number_of_events_per_file
            for key, value in data.items():
                if value.dtype.kind == 'U':
                    # h5py cannot store unicode arrays directly -> encode to utf8 bytes
                    fout[key] = [np.char.encode(c, 'utf8') for c in value[i1:i2]]
                else:
                    fout[key] = value[i1:i2]
            # NOTE(review): group datasets are sliced with the same per-event
            # indices as the top-level datasets; this assumes one entry per
            # event in every group dataset — confirm against the writers.
            for key in groups:
                logger.info("writing group {}".format(key))
                g = fout.create_group(key)
                for key2 in groups[key]:
                    logger.info("writing data set {}".format(key2))
                    tmp = groups[key][key2][i1:i2]
                    g.create_dataset(key2, tmp.shape, dtype=groups[key][key2].dtype,
                                     compression='gzip')[...] = tmp
                # save group attributes
                for key2 in group_attrs[key]:
                    fout[key].attrs[key2] = group_attrs[key][key2]
if __name__ == "__main__":
    # Command-line interface: splits up an hdf5 file into multiple files.
    #
    # Positional arguments:
    #   file          the input file
    #   outputfolder  the output folder (created if it does not exist)
    #   n_events      the maximum number of events in each file; the last file
    #                 will contain fewer events.
    # Optional:
    #   --loglevel    set to DEBUG, INFO, or WARNING, e.g. `--loglevel DEBUG`.
    parser = argparse.ArgumentParser(description='Split hdf5 file into multiple files')
    parser.add_argument('file', type=str, help='input file')
    parser.add_argument('outputfolder', type=str, help='output folder')
    parser.add_argument('n_events', type=int, help='number of events per file')
    parser.add_argument('--loglevel', metavar='level',
                        help='loglevel set to either DEBUG, INFO, or WARNING')
    args = parser.parse_args()
    if args.loglevel is not None:
        # getattr instead of eval: same lookup of logging.DEBUG/INFO/WARNING
        # (including AttributeError on an unknown level) without executing
        # arbitrary user-supplied text.
        logger.setLevel(getattr(logging, args.loglevel))
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(args.outputfolder, exist_ok=True)
    input_filename = os.path.basename(args.file)
    split_hdf5_input_file(args.file, os.path.join(args.outputfolder, input_filename), args.n_events)