Source code for pycbc.io.hdf

# convenience classes for accessing hdf5 trigger files
# the 'get_column()' method is implemented in parallel to
# the existing pylal.SnglInspiralUtils functions

import h5py
import numpy as np
import logging
import inspect
import pickle

from itertools import chain
from io import BytesIO
from lal import LIGOTimeGPS, YRJUL_SI

from ligo.lw import ligolw
from ligo.lw import lsctables
from ligo.lw import utils as ligolw_utils
from ligo.lw.utils import process as ligolw_process

from pycbc import version as pycbc_version
from pycbc.io.ligolw import return_search_summary, return_empty_sngl
from pycbc import events, conversions, pnutils
from pycbc.events import ranking, veto
from pycbc.events import mean_if_greater_than_zero
from pycbc.pnutils import mass1_mass2_to_mchirp_eta

class HFile(h5py.File):
    """ Low level extensions to the capabilities of reading an hdf5 File
    """

    def select(self, fcn, *args, **kwds):
        """ Return arrays from an hdf5 file that satisfy the given function

        Parameters
        ----------
        fcn : a function
            A function that accepts the same number of arguments as keys given
            and returns a boolean array of the same length.

        args : strings
            A variable number of strings that are keys into the hdf5. These
            must refer to arrays of equal length.

        chunksize : {1e6, int}, optional
            Number of elements to read and process at a time.

        return_indices : bool, optional
            If True, also return the indices of elements passing the function.

        Returns
        -------
        values : np.ndarrays
            A variable number of arrays depending on the number of keys into
            the hdf5 file that are given. If return_indices is True, the
            first element is an array of indices of elements passing the
            function.

        >>> f = HFile(filename)
        >>> snr = f.select(lambda snr: snr > 6, 'H1/snr')
        """
        # get references to each array
        refs = {}
        data = {}
        for arg in args:
            refs[arg] = self[arg]
            data[arg] = []

        return_indices = kwds.get('return_indices', False)
        indices = np.array([], dtype=np.uint64)

        # To conserve memory read the array in chunks
        chunksize = kwds.get('chunksize', int(1e6))
        size = len(refs[arg])

        i = 0
        while i < size:
            r = i + chunksize if i + chunksize < size else size

            # Read each chunk's worth of data and find where it passes
            # the function
            partial = [refs[arg][i:r] for arg in args]
            keep = fcn(*partial)
            if return_indices:
                indices = np.concatenate([indices, np.flatnonzero(keep) + i])

            # Store only the results that pass the function
            for arg, part in zip(args, partial):
                data[arg].append(part[keep])

            i += chunksize

        # Combine the partial results into full arrays
        if len(args) == 1:
            res = np.concatenate(data[args[0]])
            if return_indices:
                return indices.astype(np.uint64), res
            else:
                return res
        else:
            res = tuple(np.concatenate(data[arg]) for arg in args)
            if return_indices:
                return (indices.astype(np.uint64),) + res
            else:
                return res
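# Illustrative usage sketch (not part of the original module). It assumes a
# merged single-detector trigger file, here called 'H1-TRIGGERS.hdf', that
# contains 'H1/snr' and 'H1/chisq' datasets of equal length.
def _example_hfile_select(filename='H1-TRIGGERS.hdf'):
    """Read loud triggers chunk by chunk instead of loading full arrays."""
    with HFile(filename, 'r') as hf:
        idx, snr, chisq = hf.select(
            lambda snr, chisq: snr > 8, 'H1/snr', 'H1/chisq',
            chunksize=2**20, return_indices=True)
    return idx, snr, chisq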
class DictArray(object):
    """ Utility for organizing sets of arrays of equal length.

    Manages a dictionary of arrays of equal length. This can also
    be instantiated with a set of hdf5 files and the key values. The full
    data is always in memory and all operations create new instances of the
    DictArray.
    """
    def __init__(self, data=None, files=None, groups=None):
        """ Create a DictArray

        Parameters
        ----------
        data: dict, optional
            Dictionary of equal length numpy arrays
        files: list of filenames, optional
            List of hdf5 filenames. Incompatible with the `data` option.
        groups: list of strings
            List of keys into each file. Required by the files option.
        """
        # Check that input fits with how the DictArray is set up
        if data and files:
            raise RuntimeError('DictArray can only have data or files as '
                               'input, not both.')
        if files and not groups:
            raise RuntimeError('If files are given then need groups.')

        self.data = data
        self.groups = groups
        if files:
            self.data = {}
            for g in groups:
                self.data[g] = []

            for f in files:
                d = HFile(f)
                for g in groups:
                    if g in d:
                        self.data[g].append(d[g][:])
                d.close()

            for k in self.data:
                if not len(self.data[k]) == 0:
                    self.data[k] = np.concatenate(self.data[k])

        for k in self.data:
            setattr(self, k, self.data[k])

    def _return(self, data):
        return self.__class__(data=data)

    def __len__(self):
        return len(self.data[tuple(self.data.keys())[0]])

    def __add__(self, other):
        data = {}
        for k in self.data:
            try:
                data[k] = np.concatenate([self.data[k], other.data[k]])
            except KeyError:
                logging.info('%s does not exist in other data' % k)
        return self._return(data=data)

    def select(self, idx):
        """ Return a new DictArray containing only the indexed values
        """
        data = {}
        for k in self.data:
            data[k] = self.data[k][idx]
        return self._return(data=data)

    def remove(self, idx):
        """ Return a new DictArray that does not contain the indexed values
        """
        data = {}
        for k in self.data:
            data[k] = np.delete(self.data[k], np.array(idx, dtype=int))
        return self._return(data=data)

    def save(self, outname):
        f = HFile(outname, "w")
        for k in self.attrs:
            f.attrs[k] = self.attrs[k]

        for k in self.data:
            f.create_dataset(k, data=self.data[k],
                             compression='gzip',
                             compression_opts=9,
                             shuffle=True)
        f.close()
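# Illustrative sketch (not part of the original module): the file names and
# group names below are assumed. Two files are read into one DictArray, a
# subset is selected with a boolean index, and '+' concatenates instances.
def _example_dictarray(files=('statmap-part1.hdf', 'statmap-part2.hdf')):
    """Collect 'stat' and 'template_id' columns from several files."""
    darr = DictArray(files=list(files), groups=['stat', 'template_id'])
    loud = darr.select(darr.stat > 9)   # new DictArray with loud events only
    combined = loud + darr              # concatenation via __add__
    return len(combined)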
class StatmapData(DictArray):
    def __init__(self, data=None, seg=None, attrs=None, files=None,
                 groups=('stat', 'time1', 'time2', 'trigger_id1',
                         'trigger_id2', 'template_id', 'decimation_factor',
                         'timeslide_id')):
        super(StatmapData, self).__init__(data=data, files=files,
                                          groups=groups)

        if data:
            self.seg = seg
            self.attrs = attrs
        elif files:
            f = HFile(files[0], "r")
            self.seg = f['segments']
            self.attrs = f.attrs

    def _return(self, data):
        return self.__class__(data=data, attrs=self.attrs, seg=self.seg)

    def cluster(self, window):
        """ Cluster the dict array, assuming it has the relevant Coinc
        columns, time1, time2, stat, and timeslide_id
        """
        # If no events, do nothing
        if len(self.time1) == 0 or len(self.time2) == 0:
            return self
        from pycbc.events import cluster_coincs
        interval = self.attrs['timeslide_interval']
        cid = cluster_coincs(self.stat, self.time1, self.time2,
                             self.timeslide_id, interval, window)
        return self.select(cid)

    def save(self, outname):
        super(StatmapData, self).save(outname)
        # append so the datasets written by DictArray.save above are kept
        with HFile(outname, "a") as f:
            for key in self.seg.keys():
                f['segments/%s/start' % key] = self.seg[key]['start'][:]
                f['segments/%s/end' % key] = self.seg[key]['end'][:]
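# Illustrative sketch (assumed file name and attributes): a statmap file from
# the coincidence workflow carries the columns listed in 'groups' above plus a
# 'segments' group and a 'timeslide_interval' attribute.
def _example_statmap_cluster(filename='H1L1-STATMAP.hdf', window=10.):
    """Cluster coincident events within a 10 s window and save the result."""
    smap = StatmapData(files=[filename])
    clustered = smap.cluster(window)
    clustered.save('H1L1-STATMAP-CLUSTERED.hdf')
    return len(clustered)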
class MultiifoStatmapData(StatmapData):
    def __init__(self, data=None, seg=None, attrs=None,
                 files=None, ifos=None):
        groups = ['decimation_factor', 'stat', 'template_id', 'timeslide_id']
        for ifo in ifos:
            groups += ['%s/time' % ifo]
            groups += ['%s/trigger_id' % ifo]

        super(MultiifoStatmapData, self).__init__(data=data, files=files,
                                                  groups=groups, attrs=attrs,
                                                  seg=seg)

    def _return(self, data):
        ifolist = self.attrs['ifos'].split(' ')
        return self.__class__(data=data, attrs=self.attrs, seg=self.seg,
                              ifos=ifolist)

    def cluster(self, window):
        """ Cluster the dict array, assuming it has the relevant coinc
        columns: the per-detector times, stat, and timeslide_id
        """
        # If no events, do nothing
        pivot_ifo = self.attrs['pivot']
        fixed_ifo = self.attrs['fixed']
        if len(self.data['%s/time' % pivot_ifo]) == 0 or \
                len(self.data['%s/time' % fixed_ifo]) == 0:
            return self
        from pycbc.events import cluster_coincs
        interval = self.attrs['timeslide_interval']
        cid = cluster_coincs(self.stat,
                             self.data['%s/time' % pivot_ifo],
                             self.data['%s/time' % fixed_ifo],
                             self.timeslide_id,
                             interval,
                             window)
        return self.select(cid)
class FileData(object):
    def __init__(self, fname, group=None, columnlist=None, filter_func=None):
        """
        Parameters
        ----------
        group : string
            Name of group to be read from the file
        columnlist : list of strings
            Names of columns to be read; if None, use all existing columns
        filter_func : string
            String should evaluate to a Boolean expression using attributes
            of the class instance derived from columns: ex. 'self.snr < 6.5'
        """
        if not fname:
            raise RuntimeError("Didn't get a file!")
        self.fname = fname
        self.h5file = HFile(fname, "r")
        if group is None:
            if len(self.h5file.keys()) == 1:
                group, = self.h5file.keys()
            else:
                raise RuntimeError("Didn't get a group!")
        self.group_key = group
        self.group = self.h5file[group]
        self.columns = columnlist if columnlist is not None \
            else list(self.group.keys())
        self.filter_func = filter_func
        self._mask = None

    def close(self):
        self.h5file.close()

    @property
    def mask(self):
        """
        Create a mask implementing the requested filter on the datasets

        Returns
        -------
        array of Boolean
            True for dataset indices to be returned by the get_column method
        """
        if self.filter_func is None:
            raise RuntimeError("Can't get a mask without a filter function!")
        else:
            # only evaluate if no previous calculation was done
            if self._mask is None:
                # get required columns into the namespace as numpy arrays
                for column in self.columns:
                    if column in self.filter_func:
                        setattr(self, column, self.group[column][:])
                self._mask = eval(self.filter_func)
            return self._mask

    def get_column(self, col):
        """
        Parameters
        ----------
        col : string
            Name of the dataset to be returned

        Returns
        -------
        numpy array
            Values from the dataset, filtered if requested
        """
        # catch corner case with an empty file (group with no datasets)
        if not len(self.group.keys()):
            return np.array([])
        vals = self.group[col]
        if self.filter_func:
            return vals[self.mask]
        else:
            return vals[:]
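# Illustrative sketch (assumed file layout): 'filter_func' is evaluated with
# eval() over the columns read from the group, so the expression below assumes
# the 'H1' group contains 'snr', 'chisq' and 'end_time' datasets.
def _example_filedata(filename='H1-TRIGGERS.hdf'):
    """Read the 'end_time' of triggers passing a string-valued filter."""
    fd = FileData(filename, group='H1',
                  columnlist=['snr', 'chisq', 'end_time'],
                  filter_func='(self.snr > 6) & (self.chisq < 100)')
    times = fd.get_column('end_time')
    fd.close()
    return times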
class DataFromFiles(object):

    def __init__(self, filelist, group=None, columnlist=None,
                 filter_func=None):
        self.files = filelist
        self.group = group
        self.columns = columnlist
        self.filter_func = filter_func

    def get_column(self, col):
        """
        Loop over files getting the requested dataset values from each

        Parameters
        ----------
        col : string
            Name of the dataset to be returned

        Returns
        -------
        numpy array
            Values from the dataset, filtered if requested and
            concatenated in order of file list
        """
        logging.info('getting %s' % col)
        vals = []
        for f in self.files:
            d = FileData(f, group=self.group, columnlist=self.columns,
                         filter_func=self.filter_func)
            vals.append(d.get_column(col))
            # Close each file since h5py has an upper limit on the number of
            # open file objects (approx. 1000)
            d.close()
        logging.info('- got %i values' % sum(len(v) for v in vals))
        return np.concatenate(vals)
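# Illustrative sketch (assumed file names): gather one column across several
# files without keeping more than one file open at a time.
def _example_data_from_files(filelist=('H1-TRIGGERS-PART1.hdf',
                                       'H1-TRIGGERS-PART2.hdf')):
    """Concatenate the 'snr' column from a list of trigger files."""
    dff = DataFromFiles(list(filelist), group='H1', columnlist=['snr'])
    return dff.get_column('snr')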
class SingleDetTriggers(object):
    """
    Provides easy access to the parameters of single-detector CBC triggers.
    """
    # FIXME: Some of these are optional and should be kwargs.
    def __init__(self, trig_file, bank_file, veto_file, segment_name,
                 filter_func, detector, premask=None):
        logging.info('Loading triggers')
        self.trigs_f = HFile(trig_file, 'r')
        self.trigs = self.trigs_f[detector]
        # convenience attributes
        self.ifo = detector
        self.detector = detector
        if bank_file:
            logging.info('Loading bank')
            self.bank = HFile(bank_file, 'r')
        else:
            logging.info('No bank file given')
            # empty dict in place of non-existent hdf file
            self.bank = {}

        if premask is not None:
            self.mask = premask
        else:
            self.mask = np.ones(len(self.trigs['end_time']), dtype=bool)

        if veto_file:
            logging.info('Applying veto segments')
            # veto_mask is an array of indices into the trigger arrays
            # giving the surviving triggers
            logging.info('%i triggers before vetoes', self.mask.sum())
            self.veto_mask, _ = events.veto.indices_outside_segments(
                self.end_time, [veto_file],
                ifo=detector, segment_name=segment_name)
            idx = np.flatnonzero(self.mask)[self.veto_mask]
            self.mask[:] = False
            self.mask[idx] = True
            logging.info('%i triggers remain after vetoes',
                         len(self.veto_mask))

        # FIXME this should use the hfile select interface to avoid
        # memory and processing limitations.
        if filter_func:
            # get required columns into the namespace with dummy attribute
            # names to avoid confusion with other class properties
            logging.info('Setting up filter function')
            for c in self.trigs.keys():
                if c in filter_func:
                    setattr(self, '_' + c, self.trigs[c][:])
            for c in self.bank.keys():
                if c in filter_func:
                    # get template parameters corresponding to triggers
                    setattr(self, '_' + c,
                            np.array(self.bank[c])[self.trigs['template_id'][:]])

            self.filter_mask = eval(filter_func.replace('self.', 'self._'))
            # remove the dummy attributes
            for c in chain(self.trigs.keys(), self.bank.keys()):
                if c in filter_func:
                    delattr(self, '_' + c)

            self.mask = self.mask & self.filter_mask
            logging.info('%i triggers remain after cut on %s',
                         sum(self.mask), filter_func)

    def __getitem__(self, key):
        # Is key in the TRIGGER_MERGE file?
        try:
            return self.get_column(key)
        except KeyError:
            pass

        # Is key in the bank file?
        try:
            self.checkbank(key)
            return self.bank[key][:][self.template_id]
        except (RuntimeError, KeyError) as exc:
            err_msg = "Cannot find {} in input files".format(key)
            raise ValueError(err_msg) from exc
    def checkbank(self, param):
        if self.bank == {}:
            raise RuntimeError("Can't get %s values without a bank file"
                               % param)

    def trig_dict(self):
        """Returns dict of the masked trigger values
        """
        mtrigs = {}
        for k in self.trigs:
            if len(self.trigs[k]) == len(self.trigs['end_time']):
                if self.mask is not None:
                    mtrigs[k] = self.trigs[k][self.mask]
                else:
                    mtrigs[k] = self.trigs[k][:]
        return mtrigs

    @classmethod
    def get_param_names(cls):
        """Returns a list of plottable CBC parameter variables"""
        return [m[0] for m in inspect.getmembers(cls)
                if type(m[1]) == property]

    def apply_mask(self, logic_mask):
        """Apply a boolean array to the set of triggers"""
        if hasattr(self.mask, 'dtype') and (self.mask.dtype == 'bool'):
            orig_indices = self.mask.nonzero()[0][logic_mask]
            self.mask[:] = False
            self.mask[orig_indices] = True
        else:
            self.mask = list(np.array(self.mask)[logic_mask])

    def mask_to_n_loudest_clustered_events(self, rank_method,
                                           n_loudest=10,
                                           cluster_window=10):
        """Edits the mask property of the class to point to the N loudest
        single detector events as ranked by ranking statistic.

        Events are clustered so that no more than 1 event within
        +/- cluster_window will be considered."""
        # If this becomes memory intensive we can optimize
        stat = rank_method.rank_stat_single((self.ifo, self.trig_dict()))
        if len(stat) == 0:
            # No triggers, so just return here
            self.stat = np.array([])
            return

        times = self.end_time
        index = stat.argsort()[::-1]
        new_times = []
        new_index = []
        for curr_idx in index:
            curr_time = times[curr_idx]
            for time in new_times:
                if abs(curr_time - time) < cluster_window:
                    break
            else:
                # Only get here if no other triggers within cluster window
                new_index.append(curr_idx)
                new_times.append(curr_time)
            if len(new_index) >= n_loudest:
                break
        index = np.array(new_index)
        index.sort()
        self.stat = stat[index]
        if hasattr(self.mask, 'dtype') and self.mask.dtype == 'bool':
            orig_indices = np.flatnonzero(self.mask)[index]
            self.mask = list(orig_indices)
        elif isinstance(self.mask, list):
            self.mask = list(np.array(self.mask)[index])
    @property
    def template_id(self):
        return self.get_column('template_id')

    @property
    def mass1(self):
        self.checkbank('mass1')
        return self.bank['mass1'][:][self.template_id]

    @property
    def mass2(self):
        self.checkbank('mass2')
        return self.bank['mass2'][:][self.template_id]

    @property
    def spin1z(self):
        self.checkbank('spin1z')
        return self.bank['spin1z'][:][self.template_id]

    @property
    def spin2z(self):
        self.checkbank('spin2z')
        return self.bank['spin2z'][:][self.template_id]

    @property
    def spin2x(self):
        self.checkbank('spin2x')
        return self.bank['spin2x'][:][self.template_id]

    @property
    def spin2y(self):
        self.checkbank('spin2y')
        return self.bank['spin2y'][:][self.template_id]

    @property
    def spin1x(self):
        self.checkbank('spin1x')
        return self.bank['spin1x'][:][self.template_id]

    @property
    def spin1y(self):
        self.checkbank('spin1y')
        return self.bank['spin1y'][:][self.template_id]

    @property
    def inclination(self):
        self.checkbank('inclination')
        return self.bank['inclination'][:][self.template_id]

    @property
    def f_lower(self):
        self.checkbank('f_lower')
        return self.bank['f_lower'][:][self.template_id]

    @property
    def approximant(self):
        self.checkbank('approximant')
        return self.bank['approximant'][:][self.template_id]

    @property
    def mtotal(self):
        return self.mass1 + self.mass2

    @property
    def mchirp(self):
        return conversions.mchirp_from_mass1_mass2(self.mass1, self.mass2)

    @property
    def eta(self):
        return conversions.eta_from_mass1_mass2(self.mass1, self.mass2)

    @property
    def effective_spin(self):
        # FIXME assumes aligned spins
        return conversions.chi_eff(self.mass1, self.mass2,
                                   self.spin1z, self.spin2z)

    # IMPROVEME: would like to have a way to access all get_freq and/or
    # other pnutils.* names rather than hard-coding each one
    # - eg make this part of a fancy interface to the bank file ?
    @property
    def f_seobnrv2_peak(self):
        return pnutils.get_freq('fSEOBNRv2Peak', self.mass1, self.mass2,
                                self.spin1z, self.spin2z)

    @property
    def f_seobnrv4_peak(self):
        return pnutils.get_freq('fSEOBNRv4Peak', self.mass1, self.mass2,
                                self.spin1z, self.spin2z)

    @property
    def end_time(self):
        return self.get_column('end_time')

    @property
    def template_duration(self):
        return self.get_column('template_duration')

    @property
    def snr(self):
        return self.get_column('snr')

    @property
    def sgchisq(self):
        return self.get_column('sg_chisq')

    @property
    def u_vals(self):
        return self.get_column('u_vals')

    @property
    def rchisq(self):
        return self.get_column('chisq') \
            / (self.get_column('chisq_dof') * 2 - 2)

    @property
    def psd_var_val(self):
        return self.get_column('psd_var_val')

    @property
    def newsnr(self):
        return ranking.newsnr(self.snr, self.rchisq)

    @property
    def newsnr_sgveto(self):
        return ranking.newsnr_sgveto(self.snr, self.rchisq, self.sgchisq)

    @property
    def newsnr_sgveto_psdvar(self):
        return ranking.newsnr_sgveto_psdvar(self.snr, self.rchisq,
                                            self.sgchisq, self.psd_var_val)

    @property
    def newsnr_sgveto_psdvar_threshold(self):
        return ranking.newsnr_sgveto_psdvar_threshold(self.snr, self.rchisq,
                                                      self.sgchisq,
                                                      self.psd_var_val)
    def get_ranking(self, rank_name, **kwargs):
        return ranking.get_sngls_ranking_from_trigs(self, rank_name, **kwargs)

    def get_column(self, cname):
        # Fiducial value that seems to work, not extensively tuned.
        MFRAC = 0.3

        # If the mask accesses few enough elements then directly use it.
        # This can be slower than reading in all the elements if most of them
        # will be read.
        if self.mask is not None and (isinstance(self.mask, list) or
                (len(self.mask.nonzero()[0]) < (len(self.mask) * MFRAC))):
            return self.trigs[cname][self.mask]

        # We have a lot of elements to read so we resort to reading the
        # entire array before masking.
        elif self.mask is not None:
            return self.trigs[cname][:][self.mask]
        else:
            return self.trigs[cname][:]
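# Illustrative sketch (assumed file names): loads H1 triggers with a template
# bank, applies a simple SNR cut through 'filter_func', and reads a derived
# ranking. The veto file and segment name are omitted here for brevity.
def _example_single_det_triggers(trig_file='H1-TRIGGERS.hdf',
                                 bank_file='BANK.hdf'):
    """Return newSNR for H1 triggers with SNR above 6."""
    sngls = SingleDetTriggers(trig_file, bank_file, None, None,
                              'self.snr > 6', 'H1')
    return sngls.newsnr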
[docs]class ForegroundTriggers(object): # FIXME: A lot of this is hardcoded to expect two ifos def __init__(self, coinc_file, bank_file, sngl_files=None, n_loudest=None, group='foreground'): self.coinc_file = FileData(coinc_file, group=group) if 'ifos' in self.coinc_file.h5file.attrs: self.ifos = self.coinc_file.h5file.attrs['ifos'].split(' ') else: self.ifos = [self.coinc_file.h5file.attrs['detector_1'], self.coinc_file.h5file.attrs['detector_2']] self.sngl_files = {} if sngl_files is not None: for sngl_file in sngl_files: curr_dat = FileData(sngl_file) curr_ifo = curr_dat.group_key self.sngl_files[curr_ifo] = curr_dat if not all([ifo in self.sngl_files.keys() for ifo in self.ifos]): print("sngl_files: {}".format(sngl_files)) print("self.ifos: {}".format(self.ifos)) raise RuntimeError("IFOs in statmap file not all represented " "by single-detector trigger files.") if not sorted(self.sngl_files.keys()) == sorted(self.ifos): logging.warning("WARNING: Single-detector trigger files " "given for IFOs not in the statmap file") self.bank_file = HFile(bank_file, "r") self.n_loudest = n_loudest self._sort_arr = None self._template_id = None self._trig_ids = None self.get_active_segments() @property def sort_arr(self): if self._sort_arr is None: ifar = self.coinc_file.get_column('ifar') sorting = ifar.argsort()[::-1] if self.n_loudest: sorting = sorting[:self.n_loudest] self._sort_arr = sorting return self._sort_arr @property def template_id(self): if self._template_id is None: template_id = self.get_coincfile_array('template_id') self._template_id = template_id return self._template_id @property def trig_id(self): if self._trig_ids is not None: return self._trig_ids self._trig_ids = {} ifos = self.coinc_file.h5file.attrs['ifos'].split(' ') for ifo in ifos: trigid = self.get_coincfile_array(ifo + '/trigger_id') self._trig_ids[ifo] = trigid return self._trig_ids
[docs] def get_coincfile_array(self, variable): return self.coinc_file.get_column(variable)[self.sort_arr]
[docs] def get_bankfile_array(self, variable): try: return self.bank_file[variable][:][self.template_id] except IndexError: if len(self.template_id) == 0: return np.array([]) raise
[docs] def get_snglfile_array_dict(self, variable): return_dict = {} for ifo in self.ifos: try: tid = self.trig_id[ifo] lgc = tid == -1 # Put in *some* value for the invalid points to avoid failure # Make sure this doesn't change the cached internal array! tid = np.copy(tid) tid[lgc] = 0 # If small number of points don't read the full file if len(tid) < 1000: curr = [] hdf_dataset = self.sngl_files[ifo].group[variable] for idx in tid: curr.append(hdf_dataset[idx]) curr = np.array(curr) else: curr = self.sngl_files[ifo].get_column(variable)[tid] except IndexError: if len(self.trig_id[ifo]) == 0: curr = np.array([]) lgc = curr == 0 else: raise return_dict[ifo] = (curr, np.logical_not(lgc)) return return_dict
[docs] def get_active_segments(self): self.active_segments = {} for ifo in self.ifos: starts = self.sngl_files[ifo].get_column('search/start_time') ends = self.sngl_files[ifo].get_column('search/end_time') self.active_segments[ifo] = veto.start_end_to_segments(starts, ends)
[docs] def get_end_time(self): ifos = self.coinc_file.h5file.attrs['ifos'].split(' ') times_gen = (self.get_coincfile_array('{}/time'.format(ifo)) for ifo in ifos) ref_times = np.array([mean_if_greater_than_zero(t)[0] for t in zip(*times_gen)]) return ref_times
[docs] def get_ifos(self): ifo_list = [] n_trigs = len(self.get_coincfile_array('template_id')) try: # First try new-style format ifos = self.coinc_file.h5file.attrs['ifos'].split(' ') for ifo in ifos: ifo_trigs = np.where(self.get_coincfile_array(ifo+'/time') < 0, '-', ifo) ifo_list.append(ifo_trigs) ifo_list = [list(trig[trig != '-']) \ for trig in iter(np.array(ifo_list).T)] except KeyError: # Else fall back on old two-det format # Currently assumes two-det is Hanford and Livingston ifo_list = [['H1', 'L1'] for i in range(n_trigs)] return ifo_list
[docs] def to_coinc_xml_object(self, file_name): outdoc = ligolw.Document() outdoc.appendChild(ligolw.LIGO_LW()) ifos = list(self.sngl_files.keys()) proc_id = ligolw_process.register_to_xmldoc(outdoc, 'pycbc', {}, instruments=ifos, comment='', version=pycbc_version.git_hash, cvs_repository='pycbc/'+pycbc_version.git_branch, cvs_entry_time=pycbc_version.date).process_id search_summ_table = lsctables.New(lsctables.SearchSummaryTable) coinc_h5file = self.coinc_file.h5file try: start_time = coinc_h5file['segments']['coinc']['start'][:].min() end_time = coinc_h5file['segments']['coinc']['end'][:].max() except KeyError: start_times = [] end_times = [] for ifo_comb in coinc_h5file['segments']: if ifo_comb == 'foreground_veto': continue seg_group = coinc_h5file['segments'][ifo_comb] start_times.append(seg_group['start'][:].min()) end_times.append(seg_group['end'][:].max()) start_time = min(start_times) end_time = max(end_times) num_trigs = len(self.sort_arr) search_summary = return_search_summary(start_time, end_time, num_trigs, ifos) search_summ_table.append(search_summary) outdoc.childNodes[0].appendChild(search_summ_table) sngl_inspiral_table = lsctables.New(lsctables.SnglInspiralTable) coinc_def_table = lsctables.New(lsctables.CoincDefTable) coinc_event_table = lsctables.New(lsctables.CoincTable) coinc_inspiral_table = lsctables.New(lsctables.CoincInspiralTable) coinc_event_map_table = lsctables.New(lsctables.CoincMapTable) time_slide_table = lsctables.New(lsctables.TimeSlideTable) # Set up time_slide table time_slide_id = lsctables.TimeSlideID(0) for ifo in ifos: time_slide_row = lsctables.TimeSlide() time_slide_row.instrument = ifo time_slide_row.time_slide_id = time_slide_id time_slide_row.offset = 0 time_slide_row.process_id = proc_id time_slide_table.append(time_slide_row) # Set up coinc_definer table coinc_def_id = lsctables.CoincDefID(0) coinc_def_row = lsctables.CoincDef() coinc_def_row.search = "inspiral" coinc_def_row.description = \ "sngl_inspiral<-->sngl_inspiral coincidences" coinc_def_row.coinc_def_id = coinc_def_id coinc_def_row.search_coinc_type = 0 coinc_def_table.append(coinc_def_row) bank_col_names = ['mass1', 'mass2', 'spin1z', 'spin2z'] bank_col_vals = {} for name in bank_col_names: bank_col_vals[name] = self.get_bankfile_array(name) coinc_event_names = ['ifar', 'time', 'fap', 'stat'] coinc_event_vals = {} for name in coinc_event_names: if name == 'time': coinc_event_vals[name] = self.get_end_time() else: coinc_event_vals[name] = self.get_coincfile_array(name) sngl_col_names = ['snr', 'chisq', 'chisq_dof', 'bank_chisq', 'bank_chisq_dof', 'cont_chisq', 'cont_chisq_dof', 'end_time', 'template_duration', 'coa_phase', 'sigmasq'] sngl_col_vals = {} for name in sngl_col_names: sngl_col_vals[name] = self.get_snglfile_array_dict(name) sngl_event_count = 0 for idx in range(len(self.sort_arr)): # Set up IDs and mapping values coinc_id = lsctables.CoincID(idx) # Set up sngls sngl_mchirps = [] sngl_mtots = [] net_snrsq = 0 triggered_ifos = [] for ifo in ifos: # If this ifo is not participating in this coincidence then # ignore it and move on. 
if not sngl_col_vals['snr'][ifo][1][idx]: continue triggered_ifos += [ifo] event_id = lsctables.SnglInspiralID(sngl_event_count) sngl_event_count += 1 sngl = return_empty_sngl() sngl.event_id = event_id sngl.ifo = ifo net_snrsq += sngl_col_vals['snr'][ifo][0][idx]**2 for name in sngl_col_names: val = sngl_col_vals[name][ifo][0][idx] if name == 'end_time': sngl.end = LIGOTimeGPS(val) else: setattr(sngl, name, val) for name in bank_col_names: val = bank_col_vals[name][idx] setattr(sngl, name, val) sngl.mtotal, sngl.eta = pnutils.mass1_mass2_to_mtotal_eta( sngl.mass1, sngl.mass2) sngl.mchirp, _ = pnutils.mass1_mass2_to_mchirp_eta( sngl.mass1, sngl.mass2) sngl.eff_distance = (sngl.sigmasq)**0.5 / sngl.snr # If exact match is not used, then take mean # masses from the single triggers sngl_mchirps += [sngl.mchirp] sngl_mtots += [sngl.mtotal] sngl_inspiral_table.append(sngl) # Set up coinc_map entry coinc_map_row = lsctables.CoincMap() coinc_map_row.table_name = 'sngl_inspiral' coinc_map_row.coinc_event_id = coinc_id coinc_map_row.event_id = event_id coinc_event_map_table.append(coinc_map_row) sngl_combined_mchirp = np.mean(sngl_mchirps) sngl_combined_mtot = np.mean(sngl_mtots) # Set up coinc inspiral and coinc event tables coinc_event_row = lsctables.Coinc() coinc_inspiral_row = lsctables.CoincInspiral() coinc_event_row.coinc_def_id = coinc_def_id coinc_event_row.nevents = len(triggered_ifos) # note that simply `coinc_event_row.instruments = triggered_ifos` # does not lead to a correct result with ligo.lw 1.7.1 coinc_event_row.instruments = ','.join(sorted(triggered_ifos)) coinc_inspiral_row.instruments = triggered_ifos coinc_event_row.time_slide_id = time_slide_id coinc_event_row.process_id = proc_id coinc_event_row.coinc_event_id = coinc_id coinc_inspiral_row.coinc_event_id = coinc_id coinc_inspiral_row.mchirp = sngl_combined_mchirp coinc_inspiral_row.mass = sngl_combined_mtot coinc_inspiral_row.end = LIGOTimeGPS(coinc_event_vals['time'][idx]) coinc_inspiral_row.snr = net_snrsq**0.5 coinc_inspiral_row.false_alarm_rate = coinc_event_vals['fap'][idx] coinc_inspiral_row.combined_far = 1./coinc_event_vals['ifar'][idx] # Transform to Hz coinc_inspiral_row.combined_far = \ coinc_inspiral_row.combined_far / YRJUL_SI coinc_event_row.likelihood = coinc_event_vals['stat'][idx] coinc_inspiral_row.minimum_duration = 0. coinc_event_table.append(coinc_event_row) coinc_inspiral_table.append(coinc_inspiral_row) outdoc.childNodes[0].appendChild(coinc_def_table) outdoc.childNodes[0].appendChild(coinc_event_table) outdoc.childNodes[0].appendChild(coinc_event_map_table) outdoc.childNodes[0].appendChild(time_slide_table) outdoc.childNodes[0].appendChild(coinc_inspiral_table) outdoc.childNodes[0].appendChild(sngl_inspiral_table) ligolw_utils.write_filename(outdoc, file_name)
[docs] def to_coinc_hdf_object(self, file_name): ofd = h5py.File(file_name,'w') # Some fields are special cases: logging.info("Outputting search results") time = self.get_end_time() ofd.create_dataset('time', data=time, dtype=np.float32) ifar = self.get_coincfile_array('ifar') ofd.create_dataset('ifar', data=ifar, dtype=np.float32) ifar_exc = self.get_coincfile_array('ifar_exc') ofd.create_dataset('ifar_exclusive', data=ifar_exc, dtype=np.float32) fap = self.get_coincfile_array('fap') ofd.create_dataset('p_value', data=fap, dtype=np.float32) fap_exc = self.get_coincfile_array('fap_exc') ofd.create_dataset('p_value_exclusive', data=fap_exc, dtype=np.float32) # Coinc fields for field in ['stat']: vals = self.get_coincfile_array(field) ofd.create_dataset(field, data=vals, dtype=np.float32) logging.info("Outputting template information") # Bank fields for field in ['mass1','mass2','spin1z','spin2z']: vals = self.get_bankfile_array(field) ofd.create_dataset(field, data=vals, dtype=np.float32) mass1 = self.get_bankfile_array('mass1') mass2 = self.get_bankfile_array('mass2') mchirp, _ = mass1_mass2_to_mchirp_eta(mass1, mass2) ofd.create_dataset('chirp_mass', data=mchirp, dtype=np.float32) logging.info("Outputting single-trigger information") logging.info("reduced chisquared") chisq_vals_valid = self.get_snglfile_array_dict('chisq') chisq_dof_vals_valid = self.get_snglfile_array_dict('chisq_dof') for ifo in self.ifos: chisq_vals = chisq_vals_valid[ifo][0] chisq_valid = chisq_vals_valid[ifo][1] chisq_dof_vals = chisq_dof_vals_valid[ifo][0] rchisq = chisq_vals / (2. * chisq_dof_vals - 2.) rchisq[np.logical_not(chisq_valid)] = -1. ofd.create_dataset(ifo + '_chisq', data=rchisq, dtype=np.float64) # Single-detector fields for field in ['sg_chisq', 'end_time', 'sigmasq', 'psd_var_val']: logging.info(field) try: vals_valid = self.get_snglfile_array_dict(field) except KeyError: logging.info(field + " is not present in the " "single-detector files") for ifo in self.ifos: vals = vals_valid[ifo][0] valid = vals_valid[ifo][1] vals[np.logical_not(valid)] = -1. ofd.create_dataset(ifo + '_'+field, data=vals, dtype=np.float32) snr_vals_valid = self.get_snglfile_array_dict('snr') network_snr_sq = np.zeros_like(snr_vals_valid[self.ifos[0]][0]) for ifo in self.ifos: vals = snr_vals_valid[ifo][0] valid = snr_vals_valid[ifo][1] vals[np.logical_not(valid)] = -1. 
ofd.create_dataset(ifo + '_snr', data=vals, dtype=np.float32) network_snr_sq[valid] += vals[valid] ** 2.0 network_snr = np.sqrt(network_snr_sq) ofd.create_dataset('network_snr', data=network_snr, dtype=np.float32) logging.info("Triggered detectors") # This creates a n_ifos by n_events matrix, with the ifo letter # if the event contains a trigger from the ifo, empty string if not triggered_matrix = [[ifo[0] if v else '' for v in snr_vals_valid[ifo][1]] for ifo in self.ifos] # This combines the ifo letters to make a single string per event triggered_detectors = [''.join(triggered).encode('ascii') for triggered in zip(*triggered_matrix)] ofd.create_dataset('trig', data=triggered_detectors, dtype='<S3') logging.info("active detectors") # This creates a n_ifos by n_events matrix, with the ifo letter # if the ifo was active at the event time, empty string if not active_matrix = [[ifo[0] if t in self.active_segments[ifo] else '' for t in time] for ifo in self.ifos] # This combines the ifo letters to make a single string per event active_detectors = [''.join(active_at_time).encode('ascii') for active_at_time in zip(*active_matrix)] ofd.create_dataset('obs', data=active_detectors, dtype='<S3') ofd.close()
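# Illustrative sketch (assumed file names): combines a statmap file, the
# template bank and per-detector merged trigger files, then writes the loudest
# events to both the XML and HDF summary formats.
def _example_foreground_triggers(coinc_file='H1L1-STATMAP.hdf',
                                 bank_file='BANK.hdf',
                                 sngl_files=('H1-TRIGGERS.hdf',
                                             'L1-TRIGGERS.hdf')):
    """Write the 10 loudest foreground events to XML and HDF files."""
    fore = ForegroundTriggers(coinc_file, bank_file,
                              sngl_files=list(sngl_files), n_loudest=10)
    fore.to_coinc_xml_object('loudest.xml')
    fore.to_coinc_hdf_object('loudest.hdf')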
class ReadByTemplate(object):
    # default assignment to {} is OK for a variable used only in __init__
    def __init__(self, filename, bank=None, segment_name=None,
                 veto_files=None, gating_veto_windows={}):
        self.filename = filename
        self.file = h5py.File(filename, 'r')
        self.ifo = tuple(self.file.keys())[0]
        self.valid = None
        self.template_num = None
        self.bank = h5py.File(bank, 'r') if bank else {}

        # Determine the segments which define the boundaries of valid times
        # to use triggers
        key = '%s/search/' % self.ifo
        s, e = self.file[key + 'start_time'][:], self.file[key + 'end_time'][:]
        self.segs = veto.start_end_to_segments(s, e).coalesce()
        if segment_name is None:
            segment_name = []
        if veto_files is None:
            veto_files = []
        for vfile, name in zip(veto_files, segment_name):
            veto_segs = veto.select_segments_by_definer(vfile, ifo=self.ifo,
                                                        segment_name=name)
            self.segs = (self.segs - veto_segs).coalesce()
        if self.ifo in gating_veto_windows:
            gating_veto = gating_veto_windows[self.ifo].split(',')
            gveto_before = float(gating_veto[0])
            gveto_after = float(gating_veto[1])
            if gveto_before > 0 or gveto_after < 0:
                raise ValueError("Gating veto window values must be negative "
                                 "before gates and positive after gates.")
            if not (gveto_before == 0 and gveto_after == 0):
                gate_times = np.unique(
                    self.file[self.ifo + '/gating/auto/time'][:])
                gating_veto_segs = veto.start_end_to_segments(
                    gate_times + gveto_before,
                    gate_times + gveto_after
                ).coalesce()
                self.segs = (self.segs - gating_veto_segs).coalesce()
        self.valid = veto.segments_to_start_end(self.segs)

    def get_data(self, col, num):
        """Get a column of data for template with id 'num'.

        Parameters
        ----------
        col: str
            Name of column to read
        num: int
            The template id to read triggers for

        Returns
        -------
        data: numpy.ndarray
            The requested column of data
        """
        ref = self.file['%s/%s_template' % (self.ifo, col)][num]
        return self.file['%s/%s' % (self.ifo, col)][ref]

    def set_template(self, num):
        """Set the active template to read from.

        Parameters
        ----------
        num: int
            The template id to read triggers for.

        Returns
        -------
        trigger_id: numpy.ndarray
            The indices of this template's triggers.
        """
        self.template_num = num
        times = self.get_data('end_time', num)

        # Determine which of these template's triggers are kept after
        # applying vetoes
        if self.valid:
            self.keep = veto.indices_within_times(times, self.valid[0],
                                                  self.valid[1])
            # logging.info('applying vetoes')
        else:
            self.keep = np.arange(0, len(times))

        if self.bank != {}:
            self.param = {}
            if 'parameters' in self.bank.attrs:
                for col in self.bank.attrs['parameters']:
                    self.param[col] = self.bank[col][self.template_num]
            else:
                for col in self.bank:
                    self.param[col] = self.bank[col][self.template_num]

        # Calculate the trigger id by adding the relative offset in self.keep
        # to the absolute beginning index of this template's triggers stored
        # in 'template_boundaries'
        trigger_id = self.keep + \
            self.file['%s/template_boundaries' % self.ifo][num]
        return trigger_id

    def __getitem__(self, col):
        """ Return the column of data for the current active template after
        applying vetoes

        Parameters
        ----------
        col: str
            Name of column to read

        Returns
        -------
        data: numpy.ndarray
            The requested column of data
        """
        if self.template_num is None:
            raise ValueError('You must call set_template() first to pick the '
                             'template to read data from')
        data = self.get_data(col, self.template_num)
        data = data[self.keep] if self.valid else data
        return data
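# Illustrative sketch (assumed file names and layout, including the
# '<column>_template' region-reference datasets used by get_data): iterate
# over a few templates and read the vetoed SNR values for each one.
def _example_read_by_template(filename='H1-TRIGGERS.hdf', bank='BANK.hdf'):
    """Collect per-template SNR arrays for the first three templates."""
    reader = ReadByTemplate(filename, bank=bank)
    snrs = {}
    for tnum in range(3):
        reader.set_template(tnum)   # also applies any veto segments
        snrs[tnum] = reader['snr']  # __getitem__ on the active template
    return snrs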
chisq_choices = ['traditional', 'cont', 'bank', 'max_cont_trad', 'sg',
                 'max_bank_cont', 'max_bank_trad', 'max_bank_cont_trad']


def get_chisq_from_file_choice(hdfile, chisq_choice):
    f = hdfile
    if chisq_choice in ['traditional', 'max_cont_trad', 'max_bank_trad',
                        'max_bank_cont_trad']:
        trad_chisq = f['chisq'][:]
        # We now need to handle the case where chisq is not actually
        # calculated; 0 is used as a sentinel value
        trad_chisq_dof = f['chisq_dof'][:]
        trad_chisq /= (trad_chisq_dof * 2 - 2)
    if chisq_choice in ['cont', 'max_cont_trad', 'max_bank_cont',
                        'max_bank_cont_trad']:
        cont_chisq = f['cont_chisq'][:]
        cont_chisq_dof = f['cont_chisq_dof'][:]
        cont_chisq /= cont_chisq_dof
    if chisq_choice in ['bank', 'max_bank_cont', 'max_bank_trad',
                        'max_bank_cont_trad']:
        bank_chisq = f['bank_chisq'][:]
        bank_chisq_dof = f['bank_chisq_dof'][:]
        bank_chisq /= bank_chisq_dof

    if chisq_choice == 'sg':
        chisq = f['sg_chisq'][:]
    elif chisq_choice == 'traditional':
        chisq = trad_chisq
    elif chisq_choice == 'cont':
        chisq = cont_chisq
    elif chisq_choice == 'bank':
        chisq = bank_chisq
    elif chisq_choice == 'max_cont_trad':
        chisq = np.maximum(trad_chisq, cont_chisq)
    elif chisq_choice == 'max_bank_cont':
        chisq = np.maximum(bank_chisq, cont_chisq)
    elif chisq_choice == 'max_bank_trad':
        chisq = np.maximum(bank_chisq, trad_chisq)
    elif chisq_choice == 'max_bank_cont_trad':
        chisq = np.maximum(np.maximum(bank_chisq, cont_chisq), trad_chisq)
    else:
        err_msg = "Do not recognize --chisq-choice %s" % chisq_choice
        raise ValueError(err_msg)
    return chisq
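# Illustrative sketch (assumed file and group names): the function expects an
# open file or group object indexable by the chisq dataset names used above.
def _example_chisq_choice(filename='H1-TRIGGERS.hdf'):
    """Return the reduced 'traditional' chi-squared for every H1 trigger."""
    with h5py.File(filename, 'r') as hdfile:
        return get_chisq_from_file_choice(hdfile['H1'], 'traditional')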
def save_dict_to_hdf5(dic, filename):
    """
    Parameters
    ----------
    dic: python dictionary to be converted to hdf5 format
    filename: desired name of hdf5 file
    """
    with h5py.File(filename, 'w') as h5file:
        recursively_save_dict_contents_to_group(h5file, '/', dic)


def recursively_save_dict_contents_to_group(h5file, path, dic):
    """
    Parameters
    ----------
    h5file: h5py file to be written to
    path: path within h5py file to saved dictionary
    dic: python dictionary to be converted to hdf5 format
    """
    for key, item in dic.items():
        if isinstance(item, (np.ndarray, np.int64, np.float64, str, int,
                             float, bytes, tuple, list)):
            h5file[path + str(key)] = item
        elif isinstance(item, dict):
            recursively_save_dict_contents_to_group(h5file,
                                                    path + key + '/',
                                                    item)
        else:
            raise ValueError('Cannot save %s type' % type(item))


def load_hdf5_to_dict(h5file, path):
    """
    Parameters
    ----------
    h5file: h5py file to be loaded as a dictionary
    path: path within h5py file to load: '/' for the whole h5py file

    Returns
    -------
    dic: dictionary with hdf5 file group content
    """
    dic = {}
    for key, item in h5file[path].items():
        if isinstance(item, h5py.Dataset):
            dic[key] = item[()]
        elif isinstance(item, h5py.Group):
            dic[key] = load_hdf5_to_dict(h5file, path + key + '/')
        else:
            raise ValueError('Cannot load %s type' % type(item))
    return dic
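# Illustrative round-trip sketch (assumed output file name): nested
# dictionaries map to hdf5 groups, so saving and re-loading gives back an
# equivalent structure (note h5py returns string datasets as bytes).
def _example_dict_roundtrip(filename='example-dict.hdf'):
    """Save a nested dictionary to hdf5 and read it back."""
    original = {'run': {'snr': np.array([5.0, 7.5]), 'ifo': 'H1'}}
    save_dict_to_hdf5(original, filename)
    with h5py.File(filename, 'r') as h5file:
        return load_hdf5_to_dict(h5file, '/')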
def combine_and_copy(f, files, group):
    """ Combine the same column from multiple files and save to a third"""
    # ensure that the files input is stable for iteration order
    assert isinstance(files, (list, tuple))
    f[group] = np.concatenate([fi[group][:] if group in fi else
                               np.array([], dtype=np.uint32)
                               for fi in files])


def name_all_datasets(files):
    assert isinstance(files, (list, tuple))
    datasets = []
    for fi in files:
        datasets += get_all_subkeys(fi, '/')
    return set(datasets)


def get_all_subkeys(grp, key):
    subkey_list = []
    subkey_start = key
    if key == '':
        grpk = grp
    else:
        grpk = grp[key]
    for sk in grpk.keys():
        path = subkey_start + '/' + sk
        if isinstance(grp[path], h5py.Dataset):
            subkey_list.append(path.lstrip('/'))
        else:
            subkey_list += get_all_subkeys(grp, path)
    # returns an empty list if there is no dataset or subgroup within the group
    return subkey_list
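# Illustrative sketch (assumed file names): copy every dataset that appears in
# a set of input files into a single combined output file.
def _example_combine_files(infiles=('part1.hdf', 'part2.hdf'),
                           outfile='combined.hdf'):
    """Concatenate all matching datasets from the inputs into one file."""
    files = [h5py.File(name, 'r') for name in infiles]
    with h5py.File(outfile, 'w') as out:
        for group in name_all_datasets(files):
            combine_and_copy(out, files, group)
    for fi in files:
        fi.close()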
#
# =============================================================================
#
#                           Checkpointing utilities
#
# =============================================================================
#
def dump_state(state, fp, path=None, dsetname='state',
               protocol=pickle.HIGHEST_PROTOCOL):
    """Dumps the given state to an hdf5 file handler.

    The state is stored as a raw binary array to ``{path}/{dsetname}`` in the
    given hdf5 file handler. If a dataset with the same name and path is
    already in the file, the dataset will be resized and overwritten with the
    new state data.

    Parameters
    ----------
    state : any picklable object
        The sampler state to dump to file. Can be the object returned by
        any of the samplers' `.state` attribute (a dictionary of
        dictionaries), or any picklable object.
    fp : h5py.File
        An open hdf5 file handler. Must have write capability enabled.
    path : str, optional
        The path (group name) to store the state dataset to. Default (None)
        will result in the array being stored to the top level.
    dsetname : str, optional
        The name of the dataset to store the binary array to. Default is
        ``state``.
    protocol : int, optional
        The protocol version to use for pickling. See the :py:mod:`pickle`
        module for more details.
    """
    memfp = BytesIO()
    pickle.dump(state, memfp, protocol=protocol)
    dump_pickle_to_hdf(memfp, fp, path=path, dsetname=dsetname)


def dump_pickle_to_hdf(memfp, fp, path=None, dsetname='state'):
    """Dumps pickled data to an hdf5 file object.

    Parameters
    ----------
    memfp : file object
        Bytes stream of pickled data.
    fp : h5py.File
        An open hdf5 file handler. Must have write capability enabled.
    path : str, optional
        The path (group name) to store the state dataset to. Default (None)
        will result in the array being stored to the top level.
    dsetname : str, optional
        The name of the dataset to store the binary array to. Default is
        ``state``.
    """
    memfp.seek(0)
    bdata = np.frombuffer(memfp.read(), dtype='S1')
    if path is not None:
        dsetname = path + '/' + dsetname
    if dsetname not in fp:
        fp.create_dataset(dsetname, shape=bdata.shape, maxshape=(None,),
                          dtype=bdata.dtype)
    elif bdata.size != fp[dsetname].shape[0]:
        fp[dsetname].resize((bdata.size,))
    fp[dsetname][:] = bdata


def load_state(fp, path=None, dsetname='state'):
    """Loads a sampler state from the given hdf5 file object.

    The sampler state is expected to be stored as a raw bytes array which can
    be loaded by pickle.

    Parameters
    ----------
    fp : h5py.File
        An open hdf5 file handler.
    path : str, optional
        The path (group name) that the state data is stored to. Default (None)
        is to read from the top level.
    dsetname : str, optional
        The name of the dataset that the state data is stored to. Default is
        ``state``.
    """
    if path is not None:
        fp = fp[path]
    bdata = fp[dsetname][()].tobytes()
    return pickle.load(BytesIO(bdata))
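# Illustrative sketch (assumed file name): any picklable object can be
# checkpointed; here a plain dictionary stands in for a sampler state.
def _example_checkpoint_roundtrip(filename='checkpoint.hdf'):
    """Dump a state dictionary to an hdf5 file and load it back."""
    state = {'iteration': 10, 'positions': np.arange(4)}
    with h5py.File(filename, 'w') as fp:
        dump_state(state, fp, path='sampler_info')
    with h5py.File(filename, 'r') as fp:
        return load_state(fp, path='sampler_info')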
__all__ = ('HFile', 'DictArray', 'StatmapData', 'MultiifoStatmapData',
           'FileData', 'DataFromFiles', 'SingleDetTriggers',
           'ForegroundTriggers', 'ReadByTemplate', 'chisq_choices',
           'get_chisq_from_file_choice', 'save_dict_to_hdf5',
           'recursively_save_dict_contents_to_group', 'load_hdf5_to_dict',
           'combine_and_copy', 'name_all_datasets', 'get_all_subkeys',
           'dump_state', 'dump_pickle_to_hdf', 'load_state')