Source code for pycbc.io.hdf

# convenience classes for accessing hdf5 trigger files
# the 'get_column()' method is implemented parallel to
# the existing pylal.SnglInspiralUtils functions

import h5py
import numpy as np
import logging
import inspect
from itertools import chain
from six.moves import range

from lal import LIGOTimeGPS, YRJUL_SI

from glue.ligolw import ligolw
from glue.ligolw import lsctables
from glue.ligolw import utils as ligolw_utils
from glue.ligolw.utils import process as ligolw_process

from pycbc import version as pycbc_version
from pycbc.tmpltbank import return_search_summary
from pycbc.tmpltbank import return_empty_sngl
from pycbc import events, conversions, pnutils
from pycbc.events import ranking
from pycbc.events.stat import sngl_statistic_dict

class HFile(h5py.File):
    """ Low level extensions to the capabilities of reading an hdf5 File
    """
    def select(self, fcn, *args, **kwds):
        """ Return arrays from an hdf5 file that satisfy the given function

        Parameters
        ----------
        fcn : a function
            A function that accepts the same number of arguments as keys
            given and returns a boolean array of the same length.
        args : strings
            A variable number of strings that are keys into the hdf5.
            These must refer to arrays of equal length.
        chunksize : {1e6, int}, optional
            Number of elements to read and process at a time.
        return_indices : bool, optional
            If True, also return the indices of elements passing the
            function.

        Returns
        -------
        values : np.ndarrays
            A variable number of arrays depending on the number of keys into
            the hdf5 file that are given. If return_indices is True, the
            first element is an array of indices of elements passing the
            function.

        >>> f = HFile(filename)
        >>> snr = f.select(lambda snr: snr > 6, 'H1/snr')
        """
        # get references to each array
        refs = {}
        data = {}
        for arg in args:
            refs[arg] = self[arg]
            data[arg] = []

        return_indices = kwds.get('return_indices', False)
        indices = np.array([], dtype=np.uint64)

        # To conserve memory read the array in chunks
        chunksize = kwds.get('chunksize', int(1e6))
        size = len(refs[args[0]])
        i = 0
        while i < size:
            r = i + chunksize if i + chunksize < size else size

            # Read each chunk's worth of data and find where it passes
            # the function
            partial = [refs[arg][i:r] for arg in args]
            keep = fcn(*partial)
            if return_indices:
                indices = np.concatenate([indices, np.flatnonzero(keep) + i])

            # store only the results that pass the function
            for arg, part in zip(args, partial):
                data[arg].append(part[keep])

            i += chunksize

        # Combine the partial results into full arrays
        if len(args) == 1:
            res = np.concatenate(data[args[0]])
            if return_indices:
                return indices.astype(np.uint64), res
            else:
                return res
        else:
            res = tuple(np.concatenate(data[arg]) for arg in args)
            if return_indices:
                return (indices.astype(np.uint64),) + res
            else:
                return res
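
# Example usage (a hedged sketch, not part of the module's API): pulling
# loud triggers from a file without loading whole datasets into memory.
# The file name and dataset paths here are hypothetical.
#
# >>> f = HFile('H1-TRIGGERS.hdf', 'r')
# >>> idx, snr, chisq = f.select(
# ...     lambda s, c: s > 6, 'H1/snr', 'H1/chisq',
# ...     return_indices=True, chunksize=int(1e5))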
class DictArray(object):
    """ Utility for organizing sets of arrays of equal length.

    Manages a dictionary of arrays of equal length. This can also
    be instantiated with a set of hdf5 files and the key values. The full
    data is always in memory and all operations create new instances of the
    DictArray.
    """
    def __init__(self, data=None, files=None, groups=None):
        """ Create a DictArray

        Parameters
        ----------
        data: dict, optional
            Dictionary of equal length numpy arrays
        files: list of filenames, optional
            List of hdf5 filenames. Incompatible with the `data` option.
        groups: list of strings
            List of keys into each file. Required by the files option.
        """
        # Check that input fits with how the DictArray is set up
        if data and files:
            raise RuntimeError('DictArray can only have data or files as '
                               'input, not both.')
        if files and not groups:
            raise RuntimeError('If files are given then need groups.')

        self.data = data
        self.groups = groups
        if files:
            self.data = {}
            for g in groups:
                self.data[g] = []

            for f in files:
                d = HFile(f, 'r')
                for g in groups:
                    if g in d:
                        self.data[g].append(d[g][:])
                d.close()

            for k in self.data:
                if not len(self.data[k]) == 0:
                    self.data[k] = np.concatenate(self.data[k])

        for k in self.data:
            setattr(self, k, self.data[k])

    def _return(self, data):
        return self.__class__(data=data)

    def __len__(self):
        return len(self.data[tuple(self.data.keys())[0]])

    def __add__(self, other):
        data = {}
        for k in self.data:
            try:
                data[k] = np.concatenate([self.data[k], other.data[k]])
            except KeyError:
                logging.info('%s does not exist in other data' % k)
        return self._return(data=data)
    def select(self, idx):
        """ Return a new DictArray containing only the indexed values
        """
        data = {}
        for k in self.data:
            data[k] = self.data[k][idx]
        return self._return(data=data)
    def remove(self, idx):
        """ Return a new DictArray that does not contain the indexed values
        """
        data = {}
        for k in self.data:
            data[k] = np.delete(self.data[k], idx)
        return self._return(data=data)
    def save(self, outname):
        f = HFile(outname, "w")
        for k in self.attrs:
            f.attrs[k] = self.attrs[k]
        for k in self.data:
            f.create_dataset(k, data=self.data[k],
                             compression='gzip',
                             compression_opts=9,
                             shuffle=True)
        f.close()
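
# Example usage (an illustrative sketch; the array contents are made up).
# Note that save() expects an 'attrs' attribute, which is set by subclasses
# such as StatmapData below.
#
# >>> a = DictArray(data={'snr': np.array([5., 7., 9.]),
# ...                     'time': np.array([1., 2., 3.])})
# >>> loud = a.select(a.data['snr'] > 6.)  # new DictArray with 2 entries
# >>> both = a + a                         # concatenates matching keys
# >>> len(both)
# 6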
class StatmapData(DictArray):
    def __init__(self, data=None, seg=None, attrs=None, files=None,
                 groups=('stat', 'time1', 'time2', 'trigger_id1',
                         'trigger_id2', 'template_id', 'decimation_factor',
                         'timeslide_id')):
        super(StatmapData, self).__init__(data=data, files=files,
                                          groups=groups)

        if data:
            self.seg = seg
            self.attrs = attrs
        elif files:
            # Keep the file open: self.seg refers into it and would be
            # invalidated if the file were closed
            f = HFile(files[0], "r")
            self.seg = f['segments']
            self.attrs = f.attrs

    def _return(self, data):
        return self.__class__(data=data, attrs=self.attrs, seg=self.seg)
    def cluster(self, window):
        """ Cluster the dict array, assuming it has the relevant Coinc
        columns: time1, time2, stat, and timeslide_id
        """
        # If no events, do nothing
        if len(self.time1) == 0 or len(self.time2) == 0:
            return self
        from pycbc.events import cluster_coincs
        interval = self.attrs['timeslide_interval']
        cid = cluster_coincs(self.stat, self.time1, self.time2,
                             self.timeslide_id, interval, window)
        return self.select(cid)
    def save(self, outname):
        super(StatmapData, self).save(outname)
        # Reopen in append mode so the datasets written by the base class
        # are preserved; opening with "w" would truncate the file again
        with HFile(outname, "a") as f:
            for key in self.seg.keys():
                f['segments/%s/start' % key] = self.seg[key]['start'][:]
                f['segments/%s/end' % key] = self.seg[key]['end'][:]
class MultiifoStatmapData(StatmapData):
    def __init__(self, data=None, seg=None, attrs=None,
                 files=None, ifos=None):
        groups = ['decimation_factor', 'stat', 'template_id', 'timeslide_id']
        for ifo in ifos:
            groups += ['%s/time' % ifo]
            groups += ['%s/trigger_id' % ifo]
        super(MultiifoStatmapData, self).__init__(data=data, files=files,
                                                  groups=groups, attrs=attrs,
                                                  seg=seg)

    def _return(self, data):
        ifolist = self.attrs['ifos'].split(' ')
        return self.__class__(data=data, attrs=self.attrs, seg=self.seg,
                              ifos=ifolist)
    def cluster(self, window):
        """ Cluster the dict array, assuming it has the relevant Coinc
        columns: the per-ifo times for the pivot and fixed detectors,
        stat, and timeslide_id
        """
        # If no events, do nothing
        pivot_ifo = self.attrs['pivot']
        fixed_ifo = self.attrs['fixed']
        if len(self.data['%s/time' % pivot_ifo]) == 0 or \
                len(self.data['%s/time' % fixed_ifo]) == 0:
            return self
        from pycbc.events import cluster_coincs
        interval = self.attrs['timeslide_interval']
        cid = cluster_coincs(self.stat,
                             self.data['%s/time' % pivot_ifo],
                             self.data['%s/time' % fixed_ifo],
                             self.timeslide_id,
                             interval,
                             window)
        return self.select(cid)
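
# Example usage (a sketch): clustering coincident triggers from a statmap
# file within a 10 s window. 'STATMAP.hdf' is a hypothetical file providing
# the groups listed above, a 'segments' group and a 'timeslide_interval'
# attribute.
#
# >>> d = StatmapData(files=['STATMAP.hdf'])
# >>> clustered = d.cluster(10.)
# >>> clustered.save('STATMAP-CLUSTERED.hdf')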
class FileData(object):
    def __init__(self, fname, group=None, columnlist=None, filter_func=None):
        """
        Parameters
        ----------
        group : string
            Name of group to be read from the file
        columnlist : list of strings
            Names of columns to be read; if None, use all existing columns
        filter_func : string
            String should evaluate to a Boolean expression using attributes
            of the class instance derived from columns: ex. 'self.snr < 6.5'
        """
        if not fname:
            raise RuntimeError("Didn't get a file!")
        self.fname = fname
        self.h5file = HFile(fname, "r")
        if group is None:
            if len(self.h5file.keys()) == 1:
                group, = self.h5file.keys()
            else:
                raise RuntimeError("Didn't get a group!")
        self.group_key = group
        self.group = self.h5file[group]
        self.columns = columnlist if columnlist is not None \
            else list(self.group.keys())
        self.filter_func = filter_func
        self._mask = None
    def close(self):
        self.h5file.close()
    @property
    def mask(self):
        """ Create a mask implementing the requested filter on the datasets

        Returns
        -------
        array of Boolean
            True for dataset indices to be returned by the get_column method
        """
        if self.filter_func is None:
            raise RuntimeError("Can't get a mask without a filter function!")
        else:
            # only evaluate if no previous calculation was done
            if self._mask is None:
                # get required columns into the namespace as numpy arrays
                for column in self.columns:
                    if column in self.filter_func:
                        setattr(self, column, self.group[column][:])
                self._mask = eval(self.filter_func)
            return self._mask
    def get_column(self, col):
        """
        Parameters
        ----------
        col : string
            Name of the dataset to be returned

        Returns
        -------
        numpy array
            Values from the dataset, filtered if requested
        """
        # catch corner case with an empty file (group with no datasets)
        if not len(self.group.keys()):
            return np.array([])
        vals = self.group[col]
        if self.filter_func:
            return vals[self.mask]
        else:
            return vals[:]
class DataFromFiles(object):
    def __init__(self, filelist, group=None, columnlist=None,
                 filter_func=None):
        self.files = filelist
        self.group = group
        self.columns = columnlist
        self.filter_func = filter_func
    def get_column(self, col):
        """ Loop over files getting the requested dataset values from each

        Parameters
        ----------
        col : string
            Name of the dataset to be returned

        Returns
        -------
        numpy array
            Values from the dataset, filtered if requested and concatenated
            in order of file list
        """
        logging.info('getting %s' % col)
        vals = []
        for f in self.files:
            d = FileData(f, group=self.group, columnlist=self.columns,
                         filter_func=self.filter_func)
            vals.append(d.get_column(col))
            # Close each file since h5py has an upper limit on the number of
            # open file objects (approx. 1000)
            d.close()
        logging.info('- got %i values' % sum(len(v) for v in vals))
        return np.concatenate(vals)
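
# Example usage (a sketch): the filter_func string is eval'ed against the
# named columns, so any column it references must exist in the group. The
# file and group names here are hypothetical.
#
# >>> d = FileData('H1-TRIGGERS.hdf', group='H1',
# ...              filter_func='(self.snr > 6.) & (self.chisq < 10.)')
# >>> snr = d.get_column('snr')  # values surviving the filter
# >>> d.close()
#
# DataFromFiles applies the same per-file logic and concatenates:
# >>> dmulti = DataFromFiles(['a.hdf', 'b.hdf'], group='H1',
# ...                        filter_func='self.snr > 6.')
# >>> snrs = dmulti.get_column('snr')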
class SingleDetTriggers(object):
    """
    Provides easy access to the parameters of single-detector CBC triggers.
    """
    # FIXME: Some of these are optional and should be kwargs.
    def __init__(self, trig_file, bank_file, veto_file, segment_name,
                 filter_func, detector, premask=None):
        logging.info('Loading triggers')
        self.trigs_f = HFile(trig_file, 'r')
        self.trigs = self.trigs_f[detector]
        # convenience attributes
        self.ifo = detector
        self.detector = detector
        if bank_file:
            logging.info('Loading bank')
            self.bank = HFile(bank_file, 'r')
        else:
            logging.info('No bank file given')
            # empty dict in place of non-existent hdf file
            self.bank = {}

        if premask is not None:
            self.mask = premask
        else:
            self.mask = np.ones(len(self.trigs['end_time']), dtype=bool)

        if veto_file:
            logging.info('Applying veto segments')
            # veto_mask is an array of indices into the trigger arrays
            # giving the surviving triggers
            logging.info('%i triggers before vetoes', self.mask.sum())
            self.veto_mask, _ = events.veto.indices_outside_segments(
                self.end_time, [veto_file],
                ifo=detector, segment_name=segment_name)
            idx = np.flatnonzero(self.mask)[self.veto_mask]
            self.mask[:] = False
            self.mask[idx] = True
            logging.info('%i triggers remain after vetoes',
                         len(self.veto_mask))

        # FIXME this should use the hfile select interface to avoid
        # memory and processing limitations.
        if filter_func:
            # get required columns into the namespace with dummy attribute
            # names to avoid confusion with other class properties
            logging.info('Setting up filter function')
            for c in self.trigs.keys():
                if c in filter_func:
                    setattr(self, '_' + c, self.trigs[c][:])
            for c in self.bank.keys():
                if c in filter_func:
                    # get template parameters corresponding to triggers
                    setattr(self, '_' + c,
                            np.array(self.bank[c])[
                                self.trigs['template_id'][:]])

            self.filter_mask = eval(filter_func.replace('self.', 'self._'))
            # remove the dummy attributes
            for c in chain(self.trigs.keys(), self.bank.keys()):
                if c in filter_func:
                    delattr(self, '_' + c)

            self.mask = self.mask & self.filter_mask
            logging.info('%i triggers remain after cut on %s',
                         sum(self.mask), filter_func)
    def checkbank(self, param):
        if self.bank == {}:
            raise RuntimeError("Can't get %s values without a bank file"
                               % param)
    def trig_dict(self):
        """Returns dict of the masked trigger values
        """
        mtrigs = {}
        for k in self.trigs:
            if len(self.trigs[k]) == len(self.trigs['end_time']):
                if self.mask is not None:
                    mtrigs[k] = self.trigs[k][self.mask]
                else:
                    mtrigs[k] = self.trigs[k][:]
        return mtrigs
    @classmethod
    def get_param_names(cls):
        """Returns a list of plottable CBC parameter variables"""
        return [m[0] for m in inspect.getmembers(cls)
                if type(m[1]) == property]
    def apply_mask(self, logic_mask):
        """Apply a boolean array to the set of triggers"""
        if hasattr(self.mask, 'dtype') and (self.mask.dtype == 'bool'):
            orig_indices = self.mask.nonzero()[0][logic_mask]
            self.mask[:] = False
            self.mask[orig_indices] = True
        else:
            self.mask = list(np.array(self.mask)[logic_mask])
    def mask_to_n_loudest_clustered_events(self, n_loudest=10,
                                           ranking_statistic="newsnr",
                                           cluster_window=10,
                                           statistic_files=None):
        """Edits the mask property of the class to point to the N loudest
        single detector events as ranked by ranking statistic.

        Events are clustered so that no more than 1 event within +/-
        cluster_window will be considered."""
        if statistic_files is None:
            statistic_files = []
        # If this becomes memory intensive we can optimize
        stat_instance = \
            sngl_statistic_dict[ranking_statistic](statistic_files)
        stat = stat_instance.single(self.trig_dict())

        # Used for naming in plots ... Seems an odd place for this to live!
        if ranking_statistic == "newsnr":
            self.stat_name = "Reweighted SNR"
        elif ranking_statistic == "newsnr_sgveto":
            self.stat_name = "Reweighted SNR (+sgveto)"
        elif ranking_statistic == "newsnr_sgveto_psdvar":
            self.stat_name = "Reweighted SNR (+sgveto+psdvar)"
        elif ranking_statistic == "snr":
            self.stat_name = "SNR"
        else:
            self.stat_name = ranking_statistic

        times = self.end_time
        index = stat.argsort()[::-1]
        new_times = []
        new_index = []
        for curr_idx in index:
            curr_time = times[curr_idx]
            for time in new_times:
                if abs(curr_time - time) < cluster_window:
                    break
            else:
                # Only get here if no other triggers within cluster window
                new_index.append(curr_idx)
                new_times.append(curr_time)
            if len(new_index) >= n_loudest:
                break
        index = np.array(new_index)
        index.sort()
        self.stat = stat[index]
        if hasattr(self.mask, 'dtype') and self.mask.dtype == 'bool':
            orig_indices = np.flatnonzero(self.mask)[index]
            self.mask = list(orig_indices)
        elif isinstance(self.mask, list):
            self.mask = list(np.array(self.mask)[index])
    @property
    def template_id(self):
        return self.get_column('template_id')

    @property
    def mass1(self):
        self.checkbank('mass1')
        return self.bank['mass1'][:][self.template_id]

    @property
    def mass2(self):
        self.checkbank('mass2')
        return self.bank['mass2'][:][self.template_id]

    @property
    def spin1z(self):
        self.checkbank('spin1z')
        return self.bank['spin1z'][:][self.template_id]

    @property
    def spin2z(self):
        self.checkbank('spin2z')
        return self.bank['spin2z'][:][self.template_id]

    @property
    def spin2x(self):
        self.checkbank('spin2x')
        return self.bank['spin2x'][:][self.template_id]

    @property
    def spin2y(self):
        self.checkbank('spin2y')
        return self.bank['spin2y'][:][self.template_id]

    @property
    def spin1x(self):
        self.checkbank('spin1x')
        return self.bank['spin1x'][:][self.template_id]

    @property
    def spin1y(self):
        self.checkbank('spin1y')
        return self.bank['spin1y'][:][self.template_id]

    @property
    def inclination(self):
        self.checkbank('inclination')
        return self.bank['inclination'][:][self.template_id]

    @property
    def f_lower(self):
        self.checkbank('f_lower')
        return self.bank['f_lower'][:][self.template_id]

    @property
    def mtotal(self):
        return self.mass1 + self.mass2

    @property
    def mchirp(self):
        return conversions.mchirp_from_mass1_mass2(self.mass1, self.mass2)

    @property
    def eta(self):
        return conversions.eta_from_mass1_mass2(self.mass1, self.mass2)

    @property
    def effective_spin(self):
        # FIXME assumes aligned spins
        return conversions.chi_eff(self.mass1, self.mass2,
                                   self.spin1z, self.spin2z)

    # IMPROVEME: would like to have a way to access all get_freq and/or
    # other pnutils.* names rather than hard-coding each one
    # - eg make this part of a fancy interface to the bank file ?
    @property
    def f_seobnrv2_peak(self):
        return pnutils.get_freq('fSEOBNRv2Peak', self.mass1, self.mass2,
                                self.spin1z, self.spin2z)

    @property
    def f_seobnrv4_peak(self):
        return pnutils.get_freq('fSEOBNRv4Peak', self.mass1, self.mass2,
                                self.spin1z, self.spin2z)

    @property
    def end_time(self):
        return self.get_column('end_time')

    @property
    def template_duration(self):
        return self.get_column('template_duration')

    @property
    def snr(self):
        return self.get_column('snr')

    @property
    def sgchisq(self):
        return self.get_column('sg_chisq')

    @property
    def u_vals(self):
        return self.get_column('u_vals')

    @property
    def rchisq(self):
        return self.get_column('chisq') \
            / (self.get_column('chisq_dof') * 2 - 2)

    @property
    def psd_var_val(self):
        return self.get_column('psd_var_val')

    @property
    def newsnr(self):
        return ranking.newsnr(self.snr, self.rchisq)

    @property
    def newsnr_sgveto(self):
        return ranking.newsnr_sgveto(self.snr, self.rchisq, self.sgchisq)

    @property
    def newsnr_sgveto_psdvar(self):
        return ranking.newsnr_sgveto_psdvar(self.snr, self.rchisq,
                                            self.sgchisq, self.psd_var_val)
    def get_column(self, cname):
        # Fiducial value that seems to work, not extensively tuned.
        MFRAC = 0.3

        # If the mask accesses few enough elements then directly use it.
        # This can be slower than reading in all the elements if most of
        # them will be read.
        if self.mask is not None and (isinstance(self.mask, list) or
                (len(self.mask.nonzero()[0]) < (len(self.mask) * MFRAC))):
            return self.trigs[cname][self.mask]
        # We have a lot of elements to read so we resort to reading the
        # entire array before masking.
        elif self.mask is not None:
            return self.trigs[cname][:][self.mask]
        else:
            return self.trigs[cname][:]
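
# Example usage (a sketch; all file names are hypothetical, and veto_file,
# segment_name and filter_func may be None to skip those cuts):
#
# >>> trigs = SingleDetTriggers('H1-TRIGGERS.hdf', 'BANK.hdf', None, None,
# ...                           None, 'H1')
# >>> trigs.mask_to_n_loudest_clustered_events(n_loudest=5)
# >>> trigs.snr       # SNRs of the 5 loudest clustered events
# >>> trigs.mchirp    # chirp masses of the matching templates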
class ForegroundTriggers(object):
    # FIXME: A lot of this is hardcoded to expect two ifos
    def __init__(self, coinc_file, bank_file, sngl_files=None,
                 n_loudest=None, group='foreground'):
        self.coinc_file = FileData(coinc_file, group=group)
        self.sngl_files = {}
        if sngl_files is not None:
            for file in sngl_files:
                curr_dat = FileData(file)
                curr_ifo = curr_dat.group_key
                self.sngl_files[curr_ifo] = curr_dat
        self.bank_file = HFile(bank_file, "r")
        self.n_loudest = n_loudest

        self._sort_arr = None
        self._template_id = None
        self._trig_ids = None

    @property
    def sort_arr(self):
        if self._sort_arr is None:
            ifar = self.coinc_file.get_column('ifar')
            sorting = ifar.argsort()[::-1]
            if self.n_loudest:
                sorting = sorting[:self.n_loudest]
            self._sort_arr = sorting
        return self._sort_arr

    @property
    def template_id(self):
        if self._template_id is None:
            template_id = self.get_coincfile_array('template_id')
            self._template_id = template_id
        return self._template_id

    @property
    def trig_id(self):
        if self._trig_ids is not None:
            return self._trig_ids
        self._trig_ids = {}
        # FIXME: There is no clear mapping from trig_id to ifo. This is
        # bad!!! For now a hack is in place.
        ifo1 = self.coinc_file.h5file.attrs['detector_1']
        ifo2 = self.coinc_file.h5file.attrs['detector_2']
        trigid1 = self.get_coincfile_array('trigger_id1')
        trigid2 = self.get_coincfile_array('trigger_id2')
        self._trig_ids[ifo1] = trigid1
        self._trig_ids[ifo2] = trigid2
        return self._trig_ids
    def get_coincfile_array(self, variable):
        return self.coinc_file.get_column(variable)[self.sort_arr]
    def get_bankfile_array(self, variable):
        try:
            return self.bank_file[variable][:][self.template_id]
        except IndexError:
            if len(self.template_id) == 0:
                return np.array([])
            raise
    def get_snglfile_array_dict(self, variable):
        return_dict = {}
        for ifo in self.sngl_files.keys():
            try:
                curr = self.sngl_files[ifo].get_column(variable)[
                    self.trig_id[ifo]]
            except IndexError:
                if len(self.trig_id[ifo]) == 0:
                    curr = np.array([])
                else:
                    raise
            return_dict[ifo] = curr
        return return_dict
    def to_coinc_xml_object(self, file_name):
        # FIXME: This function will only work with two ifos!!

        outdoc = ligolw.Document()
        outdoc.appendChild(ligolw.LIGO_LW())

        ifos = list(self.sngl_files.keys())
        proc_id = ligolw_process.register_to_xmldoc(
            outdoc, 'pycbc', {}, ifos=ifos, comment='',
            version=pycbc_version.git_hash,
            cvs_repository='pycbc/' + pycbc_version.git_branch,
            cvs_entry_time=pycbc_version.date).process_id

        search_summ_table = lsctables.New(lsctables.SearchSummaryTable)
        coinc_h5file = self.coinc_file.h5file
        start_time = coinc_h5file['segments']['coinc']['start'][:].min()
        end_time = coinc_h5file['segments']['coinc']['end'][:].max()
        num_trigs = len(self.sort_arr)
        search_summary = return_search_summary(start_time, end_time,
                                               num_trigs, ifos)
        search_summ_table.append(search_summary)
        outdoc.childNodes[0].appendChild(search_summ_table)

        sngl_inspiral_table = lsctables.New(lsctables.SnglInspiralTable)
        coinc_def_table = lsctables.New(lsctables.CoincDefTable)
        coinc_event_table = lsctables.New(lsctables.CoincTable)
        coinc_inspiral_table = lsctables.New(lsctables.CoincInspiralTable)
        coinc_event_map_table = lsctables.New(lsctables.CoincMapTable)
        time_slide_table = lsctables.New(lsctables.TimeSlideTable)

        # Set up time_slide table
        time_slide_id = lsctables.TimeSlideID(0)
        for ifo in ifos:
            time_slide_row = lsctables.TimeSlide()
            time_slide_row.instrument = ifo
            time_slide_row.time_slide_id = time_slide_id
            time_slide_row.offset = 0
            time_slide_row.process_id = proc_id
            time_slide_table.append(time_slide_row)

        # Set up coinc_definer table
        coinc_def_id = lsctables.CoincDefID(0)
        coinc_def_row = lsctables.CoincDef()
        coinc_def_row.search = "inspiral"
        coinc_def_row.description = \
            "sngl_inspiral<-->sngl_inspiral coincidences"
        coinc_def_row.coinc_def_id = coinc_def_id
        coinc_def_row.search_coinc_type = 0
        coinc_def_table.append(coinc_def_row)

        bank_col_names = ['mass1', 'mass2', 'spin1z', 'spin2z']
        bank_col_vals = {}
        for name in bank_col_names:
            bank_col_vals[name] = self.get_bankfile_array(name)

        coinc_event_names = ['ifar', 'time1', 'fap', 'stat']
        coinc_event_vals = {}
        for name in coinc_event_names:
            coinc_event_vals[name] = self.get_coincfile_array(name)

        sngl_col_names = ['snr', 'chisq', 'chisq_dof', 'bank_chisq',
                          'bank_chisq_dof', 'cont_chisq', 'cont_chisq_dof',
                          'end_time', 'template_duration', 'coa_phase',
                          'sigmasq']
        sngl_col_vals = {}
        for name in sngl_col_names:
            sngl_col_vals[name] = self.get_snglfile_array_dict(name)

        for idx in range(len(self.sort_arr)):
            # Set up IDs and mapping values
            coinc_id = lsctables.CoincID(idx)

            # Set up sngls
            # FIXME: As two-ifo is hardcoded loop over all ifos
            sngl_combined_mchirp = 0
            sngl_combined_mtot = 0
            for ifo in ifos:
                sngl_id = self.trig_id[ifo][idx]
                event_id = lsctables.SnglInspiralID(sngl_id)
                sngl = return_empty_sngl()
                sngl.event_id = event_id
                sngl.ifo = ifo
                for name in sngl_col_names:
                    val = sngl_col_vals[name][ifo][idx]
                    if name == 'end_time':
                        sngl.set_end(LIGOTimeGPS(val))
                    else:
                        setattr(sngl, name, val)
                for name in bank_col_names:
                    val = bank_col_vals[name][idx]
                    setattr(sngl, name, val)
                sngl.mtotal, sngl.eta = pnutils.mass1_mass2_to_mtotal_eta(
                    sngl.mass1, sngl.mass2)
                sngl.mchirp, _ = pnutils.mass1_mass2_to_mchirp_eta(
                    sngl.mass1, sngl.mass2)
                sngl.eff_distance = (sngl.sigmasq)**0.5 / sngl.snr
                sngl_combined_mchirp += sngl.mchirp
                sngl_combined_mtot += sngl.mtotal
                sngl_inspiral_table.append(sngl)

                # Set up coinc_map entry
                coinc_map_row = lsctables.CoincMap()
                coinc_map_row.table_name = 'sngl_inspiral'
                coinc_map_row.coinc_event_id = coinc_id
                coinc_map_row.event_id = event_id
                coinc_event_map_table.append(coinc_map_row)

            sngl_combined_mchirp = sngl_combined_mchirp / len(ifos)
            sngl_combined_mtot = sngl_combined_mtot / len(ifos)

            # Set up coinc inspiral and coinc event tables
            coinc_event_row = lsctables.Coinc()
            coinc_inspiral_row = lsctables.CoincInspiral()
            coinc_event_row.coinc_def_id = coinc_def_id
            coinc_event_row.nevents = len(ifos)
            coinc_event_row.instruments = ','.join(ifos)
            coinc_inspiral_row.set_ifos(ifos)
            coinc_event_row.time_slide_id = time_slide_id
            coinc_event_row.process_id = proc_id
            coinc_event_row.coinc_event_id = coinc_id
            coinc_inspiral_row.coinc_event_id = coinc_id
            coinc_inspiral_row.mchirp = sngl_combined_mchirp
            coinc_inspiral_row.mass = sngl_combined_mtot
            coinc_inspiral_row.set_end(
                LIGOTimeGPS(coinc_event_vals['time1'][idx]))
            coinc_inspiral_row.snr = coinc_event_vals['stat'][idx]
            coinc_inspiral_row.false_alarm_rate = \
                coinc_event_vals['fap'][idx]
            coinc_inspiral_row.combined_far = \
                1. / coinc_event_vals['ifar'][idx]
            # Transform from per-year to per-second (Hz)
            coinc_inspiral_row.combined_far = \
                coinc_inspiral_row.combined_far / YRJUL_SI
            coinc_event_row.likelihood = 0.
            coinc_inspiral_row.minimum_duration = 0.
            coinc_event_table.append(coinc_event_row)
            coinc_inspiral_table.append(coinc_inspiral_row)

        outdoc.childNodes[0].appendChild(coinc_def_table)
        outdoc.childNodes[0].appendChild(coinc_event_table)
        outdoc.childNodes[0].appendChild(coinc_event_map_table)
        outdoc.childNodes[0].appendChild(time_slide_table)
        outdoc.childNodes[0].appendChild(coinc_inspiral_table)
        outdoc.childNodes[0].appendChild(sngl_inspiral_table)
        ligolw_utils.write_filename(outdoc, file_name)
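
# Example usage (a sketch; input file names are hypothetical):
#
# >>> fore = ForegroundTriggers('STATMAP.hdf', 'BANK.hdf',
# ...                           sngl_files=['H1-TRIGGERS.hdf',
# ...                                       'L1-TRIGGERS.hdf'],
# ...                           n_loudest=10)
# >>> fore.to_coinc_xml_object('H1L1-FOREGROUND.xml')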
chisq_choices = ['traditional', 'cont', 'bank', 'max_cont_trad', 'sg',
                 'max_bank_cont', 'max_bank_trad', 'max_bank_cont_trad']
def get_chisq_from_file_choice(hdfile, chisq_choice):
    f = hdfile
    if chisq_choice in ['traditional', 'max_cont_trad', 'max_bank_trad',
                        'max_bank_cont_trad']:
        trad_chisq = f['chisq'][:]
        # We now need to handle the case where chisq is not actually
        # calculated; 0 is used as a sentinel value
        trad_chisq_dof = f['chisq_dof'][:]
        trad_chisq /= (trad_chisq_dof * 2 - 2)
    if chisq_choice in ['cont', 'max_cont_trad', 'max_bank_cont',
                        'max_bank_cont_trad']:
        cont_chisq = f['cont_chisq'][:]
        cont_chisq_dof = f['cont_chisq_dof'][:]
        cont_chisq /= cont_chisq_dof
    if chisq_choice in ['bank', 'max_bank_cont', 'max_bank_trad',
                        'max_bank_cont_trad']:
        bank_chisq = f['bank_chisq'][:]
        bank_chisq_dof = f['bank_chisq_dof'][:]
        bank_chisq /= bank_chisq_dof

    if chisq_choice == 'sg':
        chisq = f['sg_chisq'][:]
    elif chisq_choice == 'traditional':
        chisq = trad_chisq
    elif chisq_choice == 'cont':
        chisq = cont_chisq
    elif chisq_choice == 'bank':
        chisq = bank_chisq
    elif chisq_choice == 'max_cont_trad':
        chisq = np.maximum(trad_chisq, cont_chisq)
    elif chisq_choice == 'max_bank_cont':
        chisq = np.maximum(bank_chisq, cont_chisq)
    elif chisq_choice == 'max_bank_trad':
        chisq = np.maximum(bank_chisq, trad_chisq)
    elif chisq_choice == 'max_bank_cont_trad':
        chisq = np.maximum(np.maximum(bank_chisq, cont_chisq), trad_chisq)
    else:
        err_msg = "Do not recognize --chisq-choice %s" % chisq_choice
        raise ValueError(err_msg)
    return chisq
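
# Example usage (a sketch): 'f' is the single-detector group of an open
# trigger file containing the chisq datasets; the file name is
# hypothetical.
#
# >>> f = HFile('H1-TRIGGERS.hdf', 'r')
# >>> chisq = get_chisq_from_file_choice(f['H1'], 'max_bank_cont_trad')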
def save_dict_to_hdf5(dic, filename):
    """
    Parameters
    ----------
    dic:
        python dictionary to be converted to hdf5 format
    filename:
        desired name of hdf5 file
    """
    with h5py.File(filename, 'w') as h5file:
        recursively_save_dict_contents_to_group(h5file, '/', dic)
def recursively_save_dict_contents_to_group(h5file, path, dic):
    """
    Parameters
    ----------
    h5file:
        h5py file to be written to
    path:
        path within h5py file to saved dictionary
    dic:
        python dictionary to be converted to hdf5 format
    """
    for key, item in dic.items():
        if isinstance(item, (np.ndarray, np.int64, np.float64, str, bytes,
                             tuple, list)):
            h5file[path + str(key)] = item
        elif isinstance(item, dict):
            recursively_save_dict_contents_to_group(h5file, path + key + '/',
                                                    item)
        else:
            raise ValueError('Cannot save %s type' % type(item))
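
# Example usage (a sketch): nested dictionaries become hdf5 groups and leaf
# values become datasets, so the dictionary below would produce datasets
# '/x', '/sub/y' and '/sub/label'. The file name is hypothetical.
#
# >>> d = {'x': np.arange(3), 'sub': {'y': [1, 2, 3], 'label': 'test'}}
# >>> save_dict_to_hdf5(d, 'nested.hdf')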
def combine_and_copy(f, files, group):
    """ Combine the same column from multiple files and save to a third"""
    # ensure that the files input is stable for iteration order
    assert isinstance(files, (list, tuple))
    f[group] = np.concatenate([fi[group][:] if group in fi else
                               np.array([], dtype=np.uint32)
                               for fi in files])
def name_all_datasets(files):
    assert isinstance(files, (list, tuple))
    datasets = []
    for fi in files:
        datasets += get_all_subkeys(fi, '/')
    return set(datasets)
def get_all_subkeys(grp, key):
    subkey_list = []
    subkey_start = key
    if key == '':
        grpk = grp
    else:
        grpk = grp[key]
    for sk in grpk.keys():
        path = subkey_start + '/' + sk
        if isinstance(grp[path], h5py.Dataset):
            subkey_list.append(path.lstrip('/'))
        else:
            subkey_list += get_all_subkeys(grp, path)
    # returns an empty list if there is no dataset or subgroup within the
    # group
    return subkey_list
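
# Example usage (a sketch): merging the union of datasets from several open
# files into a single output file. The file names are hypothetical.
#
# >>> ins = [HFile(fn, 'r') for fn in ['a.hdf', 'b.hdf']]
# >>> out = HFile('combined.hdf', 'w')
# >>> for dset in name_all_datasets(ins):
# ...     combine_and_copy(out, ins, dset)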