# convenience classes for accessing hdf5 trigger files
# The 'get_column()' method is implemented in parallel to
# the existing pylal.SnglInspiralUtils functions.
import h5py
import numpy as np
import logging
import inspect
import pickle
from itertools import chain
from io import BytesIO
from lal import LIGOTimeGPS, YRJUL_SI
from ligo.lw import ligolw
from ligo.lw import lsctables
from ligo.lw import utils as ligolw_utils
from ligo.lw.utils import process as ligolw_process
from pycbc import version as pycbc_version
from pycbc.io.ligolw import return_search_summary, return_empty_sngl
from pycbc import events, conversions, pnutils
from pycbc.events import ranking, veto
from pycbc.events import mean_if_greater_than_zero
from pycbc.pnutils import mass1_mass2_to_mchirp_eta
class HFile(h5py.File):
""" Low level extensions to the capabilities of reading an hdf5 File
"""
    def select(self, fcn, *args, **kwds):
""" Return arrays from an hdf5 file that satisfy the given function
Parameters
----------
fcn : a function
            A function that accepts the same number of arguments as keys given
and returns a boolean array of the same length.
        args : strings
            A variable number of strings that are keys into the hdf5 file.
            These must refer to arrays of equal length.
chunksize : {1e6, int}, optional
Number of elements to read and process at a time.
return_indices : bool, optional
If True, also return the indices of elements passing the function.
Returns
-------
values : np.ndarrays
A variable number of arrays depending on the number of keys into
the hdf5 file that are given. If return_indices is True, the first
element is an array of indices of elements passing the function.
>>> f = HFile(filename)
>>> snr = f.select(lambda snr: snr > 6, 'H1/snr')
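        >>> # A hedged second example (the dataset names 'H1/snr' and
        >>> # 'H1/chisq' are assumptions): return_indices also yields the
        >>> # indices of the surviving elements.
        >>> idx, snr, chisq = f.select(
        ...     lambda snr, chisq: (snr > 6) & (chisq < 10),
        ...     'H1/snr', 'H1/chisq', return_indices=True, chunksize=int(1e5))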
"""
# get references to each array
refs = {}
data = {}
for arg in args:
refs[arg] = self[arg]
data[arg] = []
return_indices = kwds.get('return_indices', False)
indices = np.array([], dtype=np.uint64)
# To conserve memory read the array in chunks
chunksize = kwds.get('chunksize', int(1e6))
        size = len(refs[args[0]])
i = 0
while i < size:
r = i + chunksize if i + chunksize < size else size
            # Read each chunk's worth of data and find where it passes
            # the function
partial = [refs[arg][i:r] for arg in args]
keep = fcn(*partial)
if return_indices:
indices = np.concatenate([indices, np.flatnonzero(keep) + i])
            # Store only the results that pass the function
for arg, part in zip(args, partial):
data[arg].append(part[keep])
i += chunksize
# Combine the partial results into full arrays
if len(args) == 1:
res = np.concatenate(data[args[0]])
if return_indices:
return indices.astype(np.uint64), res
else:
return res
else:
res = tuple(np.concatenate(data[arg]) for arg in args)
if return_indices:
return (indices.astype(np.uint64),) + res
else:
return res
class DictArray(object):
""" Utility for organizing sets of arrays of equal length.
Manages a dictionary of arrays of equal length. This can also
be instantiated with a set of hdf5 files and the key values. The full
data is always in memory and all operations create new instances of the
DictArray.
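    Examples
    --------
    A minimal sketch using in-memory data (the key names are arbitrary):
    >>> a = DictArray(data={'stat': np.array([5., 6.]),
    ...                     'time': np.array([1., 2.])})
    >>> b = DictArray(data={'stat': np.array([7.]), 'time': np.array([3.])})
    >>> c = a + b          # concatenates each array key-by-key
    >>> len(c)
    3
    >>> c.select(np.array([0, 2])).stat
    array([5., 7.])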
"""
def __init__(self, data=None, files=None, groups=None):
""" Create a DictArray
Parameters
----------
data: dict, optional
Dictionary of equal length numpy arrays
files: list of filenames, optional
            List of hdf5 filenames. Incompatible with the `data` option.
groups: list of strings
List of keys into each file. Required by the files option.
"""
# Check that input fits with how the DictArray is set up
if data and files:
raise RuntimeError('DictArray can only have data or files as '
'input, not both.')
if files and not groups:
            raise RuntimeError('If files are given, groups are also required.')
self.data = data
self.groups = groups
if files:
self.data = {}
for g in groups:
self.data[g] = []
for f in files:
d = HFile(f)
for g in groups:
if g in d:
self.data[g].append(d[g][:])
d.close()
for k in self.data:
                if len(self.data[k]) > 0:
self.data[k] = np.concatenate(self.data[k])
for k in self.data:
setattr(self, k, self.data[k])
def _return(self, data):
return self.__class__(data=data)
def __len__(self):
return len(self.data[tuple(self.data.keys())[0]])
def __add__(self, other):
data = {}
for k in self.data:
try:
data[k] = np.concatenate([self.data[k], other.data[k]])
except KeyError:
                logging.info('%s does not exist in other data', k)
return self._return(data=data)
    def select(self, idx):
""" Return a new DictArray containing only the indexed values
"""
data = {}
for k in self.data:
data[k] = self.data[k][idx]
return self._return(data=data)
    def remove(self, idx):
""" Return a new DictArray that does not contain the indexed values
"""
data = {}
for k in self.data:
data[k] = np.delete(self.data[k], np.array(idx, dtype=int))
return self._return(data=data)
    def save(self, outname):
        f = HFile(outname, "w")
        # note: 'attrs' is expected to be provided by a subclass such as
        # StatmapData; a bare DictArray does not define it
        for k in self.attrs:
f.attrs[k] = self.attrs[k]
for k in self.data:
f.create_dataset(k, data=self.data[k],
compression='gzip',
compression_opts=9,
shuffle=True)
f.close()
class StatmapData(DictArray):
def __init__(self, data=None, seg=None, attrs=None, files=None,
groups=('stat', 'time1', 'time2', 'trigger_id1',
'trigger_id2', 'template_id', 'decimation_factor',
'timeslide_id')):
super(StatmapData, self).__init__(data=data, files=files,
groups=groups)
if data:
            self.seg = seg
            self.attrs = attrs
elif files:
f = HFile(files[0], "r")
self.seg = f['segments']
self.attrs = f.attrs
def _return(self, data):
return self.__class__(data=data, attrs=self.attrs, seg=self.seg)
    def cluster(self, window):
        """ Cluster the dict array, assuming it has the relevant coinc
        columns: time1, time2, stat, and timeslide_id
        """
# If no events, do nothing
if len(self.time1) == 0 or len(self.time2) == 0:
return self
from pycbc.events import cluster_coincs
interval = self.attrs['timeslide_interval']
cid = cluster_coincs(self.stat, self.time1, self.time2,
self.timeslide_id, interval, window)
return self.select(cid)
    def save(self, outname):
        super(StatmapData, self).save(outname)
        # reopen in append mode so that the datasets already written by
        # DictArray.save are not truncated away
        with HFile(outname, "a") as f:
for key in self.seg.keys():
f['segments/%s/start' % key] = self.seg[key]['start'][:]
f['segments/%s/end' % key] = self.seg[key]['end'][:]
class MultiifoStatmapData(StatmapData):
def __init__(self, data=None, seg=None, attrs=None,
files=None, ifos=None):
groups = ['decimation_factor', 'stat', 'template_id', 'timeslide_id']
for ifo in ifos:
groups += ['%s/time' % ifo]
groups += ['%s/trigger_id' % ifo]
super(MultiifoStatmapData, self).__init__(data=data, files=files,
groups=groups, attrs=attrs,
seg=seg)
def _return(self, data):
ifolist = self.attrs['ifos'].split(' ')
return self.__class__(data=data, attrs=self.attrs, seg=self.seg,
ifos=ifolist)
    def cluster(self, window):
        """ Cluster the dict array, assuming it has the relevant coinc
        columns: a time column per ifo, stat, and timeslide_id
        """
# If no events, do nothing
pivot_ifo = self.attrs['pivot']
fixed_ifo = self.attrs['fixed']
        if len(self.data['%s/time' % pivot_ifo]) == 0 or \
                len(self.data['%s/time' % fixed_ifo]) == 0:
return self
from pycbc.events import cluster_coincs
interval = self.attrs['timeslide_interval']
cid = cluster_coincs(self.stat,
self.data['%s/time' % pivot_ifo],
self.data['%s/time' % fixed_ifo],
self.timeslide_id,
interval,
window)
return self.select(cid)
class FileData(object):
def __init__(self, fname, group=None, columnlist=None, filter_func=None):
"""
        Parameters
        ----------
        fname : string
            Path of the hdf5 file to be read
        group : string
            Name of group to be read from the file
columnlist : list of strings
Names of columns to be read; if None, use all existing columns
filter_func : string
String should evaluate to a Boolean expression using attributes
of the class instance derived from columns: ex. 'self.snr < 6.5'
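        Example
        -------
        A hedged sketch; the file name, group and column names here are
        illustrative only:
        >>> d = FileData('H1-TRIGGERS.hdf', group='H1',
        ...              filter_func='self.snr > 6.5')
        >>> loud_times = d.get_column('end_time')
        >>> d.close()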
"""
if not fname: raise RuntimeError("Didn't get a file!")
self.fname = fname
self.h5file = HFile(fname, "r")
if group is None:
if len(self.h5file.keys()) == 1:
group, = self.h5file.keys()
else:
raise RuntimeError("Didn't get a group!")
self.group_key = group
self.group = self.h5file[group]
self.columns = columnlist if columnlist is not None \
else list(self.group.keys())
self.filter_func = filter_func
self._mask = None
    def close(self):
self.h5file.close()
@property
def mask(self):
"""
Create a mask implementing the requested filter on the datasets
Returns
-------
array of Boolean
True for dataset indices to be returned by the get_column method
"""
if self.filter_func is None:
raise RuntimeError("Can't get a mask without a filter function!")
else:
# only evaluate if no previous calculation was done
if self._mask is None:
# get required columns into the namespace as numpy arrays
for column in self.columns:
if column in self.filter_func:
setattr(self, column, self.group[column][:])
self._mask = eval(self.filter_func)
return self._mask
    def get_column(self, col):
"""
Parameters
----------
col : string
Name of the dataset to be returned
Returns
-------
numpy array
Values from the dataset, filtered if requested
"""
# catch corner case with an empty file (group with no datasets)
if not len(self.group.keys()):
return np.array([])
vals = self.group[col]
if self.filter_func:
return vals[self.mask]
else:
return vals[:]
class DataFromFiles(object):
def __init__(self, filelist, group=None, columnlist=None, filter_func=None):
self.files = filelist
self.group = group
self.columns = columnlist
self.filter_func = filter_func
    def get_column(self, col):
"""
Loop over files getting the requested dataset values from each
Parameters
----------
col : string
Name of the dataset to be returned
Returns
-------
numpy array
Values from the dataset, filtered if requested and
concatenated in order of file list
"""
        logging.info('getting %s', col)
vals = []
for f in self.files:
d = FileData(f, group=self.group, columnlist=self.columns,
filter_func=self.filter_func)
vals.append(d.get_column(col))
# Close each file since h5py has an upper limit on the number of
# open file objects (approx. 1000)
d.close()
        logging.info('- got %i values', sum(len(v) for v in vals))
return np.concatenate(vals)
class SingleDetTriggers(object):
"""
Provides easy access to the parameters of single-detector CBC triggers.
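    A hedged usage sketch (the file names are illustrative; the veto file,
    segment name and filter function may each be None to skip that cut):
    >>> trigs = SingleDetTriggers('H1-TRIGGERS.hdf', 'BANK.hdf', None,
    ...                           None, None, 'H1')
    >>> snr = trigs.snr       # masked single-detector SNRs
    >>> mc = trigs.mchirp     # bank parameters mapped through template_id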
"""
# FIXME: Some of these are optional and should be kwargs.
def __init__(self, trig_file, bank_file, veto_file,
segment_name, filter_func, detector, premask=None):
logging.info('Loading triggers')
self.trigs_f = HFile(trig_file, 'r')
self.trigs = self.trigs_f[detector]
self.ifo = detector # convenience attributes
self.detector = detector
if bank_file:
logging.info('Loading bank')
self.bank = HFile(bank_file, 'r')
else:
logging.info('No bank file given')
# empty dict in place of non-existent hdf file
self.bank = {}
if premask is not None:
self.mask = premask
else:
self.mask = np.ones(len(self.trigs['end_time']), dtype=bool)
if veto_file:
logging.info('Applying veto segments')
# veto_mask is an array of indices into the trigger arrays
# giving the surviving triggers
logging.info('%i triggers before vetoes', self.mask.sum())
self.veto_mask, _ = events.veto.indices_outside_segments(
self.end_time, [veto_file],
ifo=detector, segment_name=segment_name)
idx = np.flatnonzero(self.mask)[self.veto_mask]
self.mask[:] = False
self.mask[idx] = True
logging.info('%i triggers remain after vetoes',
len(self.veto_mask))
# FIXME this should use the hfile select interface to avoid
# memory and processing limitations.
if filter_func:
# get required columns into the namespace with dummy attribute
# names to avoid confusion with other class properties
logging.info('Setting up filter function')
for c in self.trigs.keys():
if c in filter_func:
setattr(self, '_'+c, self.trigs[c][:])
for c in self.bank.keys():
if c in filter_func:
# get template parameters corresponding to triggers
setattr(self, '_'+c,
np.array(self.bank[c])[self.trigs['template_id'][:]])
self.filter_mask = eval(filter_func.replace('self.', 'self._'))
# remove the dummy attributes
for c in chain(self.trigs.keys(), self.bank.keys()):
if c in filter_func: delattr(self, '_'+c)
self.mask = self.mask & self.filter_mask
logging.info('%i triggers remain after cut on %s',
sum(self.mask), filter_func)
def __getitem__(self, key):
# Is key in the TRIGGER_MERGE file?
try:
return self.get_column(key)
except KeyError:
pass
# Is key in the bank file?
try:
self.checkbank(key)
return self.bank[key][:][self.template_id]
except (RuntimeError, KeyError) as exc:
err_msg = "Cannot find {} in input files".format(key)
raise ValueError(err_msg) from exc
    def checkbank(self, param):
        if self.bank == {}:
            raise RuntimeError("Can't get %s values without a bank file"
                               % param)
    def trig_dict(self):
        """Returns dict of the masked trigger values"""
mtrigs = {}
for k in self.trigs:
if len(self.trigs[k]) == len(self.trigs['end_time']):
if self.mask is not None:
mtrigs[k] = self.trigs[k][self.mask]
else:
mtrigs[k] = self.trigs[k][:]
return mtrigs
    @classmethod
def get_param_names(cls):
"""Returns a list of plottable CBC parameter variables"""
        return [m[0] for m in inspect.getmembers(cls)
                if isinstance(m[1], property)]
    def apply_mask(self, logic_mask):
"""Apply a boolean array to the set of triggers"""
if hasattr(self.mask, 'dtype') and (self.mask.dtype == 'bool'):
orig_indices = self.mask.nonzero()[0][logic_mask]
self.mask[:] = False
self.mask[orig_indices] = True
else:
self.mask = list(np.array(self.mask)[logic_mask])
    def mask_to_n_loudest_clustered_events(self, rank_method,
                                           n_loudest=10,
                                           cluster_window=10):
        """Edits the mask property of the class to point to the N loudest
        single detector events as ranked by the ranking statistic. Events
        are clustered so that no more than one event within +/- the cluster
        window is kept."""
# If this becomes memory intensive we can optimize
stat = rank_method.rank_stat_single((self.ifo, self.trig_dict()))
if len(stat) == 0:
# No triggers, so just return here
self.stat = np.array([])
return
times = self.end_time
index = stat.argsort()[::-1]
new_times = []
new_index = []
for curr_idx in index:
curr_time = times[curr_idx]
for time in new_times:
if abs(curr_time - time) < cluster_window:
break
else:
# Only get here if no other triggers within cluster window
new_index.append(curr_idx)
new_times.append(curr_time)
if len(new_index) >= n_loudest:
break
index = np.array(new_index)
index.sort()
self.stat = stat[index]
if hasattr(self.mask, 'dtype') and self.mask.dtype == 'bool':
orig_indices = np.flatnonzero(self.mask)[index]
self.mask = list(orig_indices)
elif isinstance(self.mask, list):
self.mask = list(np.array(self.mask)[index])
@property
def template_id(self):
return self.get_column('template_id')
@property
def mass1(self):
self.checkbank('mass1')
return self.bank['mass1'][:][self.template_id]
@property
def mass2(self):
self.checkbank('mass2')
return self.bank['mass2'][:][self.template_id]
@property
def spin1z(self):
self.checkbank('spin1z')
return self.bank['spin1z'][:][self.template_id]
@property
def spin2z(self):
self.checkbank('spin2z')
return self.bank['spin2z'][:][self.template_id]
@property
def spin2x(self):
self.checkbank('spin2x')
return self.bank['spin2x'][:][self.template_id]
@property
def spin2y(self):
self.checkbank('spin2y')
return self.bank['spin2y'][:][self.template_id]
@property
def spin1x(self):
self.checkbank('spin1x')
return self.bank['spin1x'][:][self.template_id]
@property
def spin1y(self):
self.checkbank('spin1y')
return self.bank['spin1y'][:][self.template_id]
@property
def inclination(self):
self.checkbank('inclination')
return self.bank['inclination'][:][self.template_id]
@property
def f_lower(self):
self.checkbank('f_lower')
return self.bank['f_lower'][:][self.template_id]
@property
def approximant(self):
self.checkbank('approximant')
return self.bank['approximant'][:][self.template_id]
@property
def mtotal(self):
return self.mass1 + self.mass2
@property
def mchirp(self):
return conversions.mchirp_from_mass1_mass2(self.mass1, self.mass2)
@property
def eta(self):
return conversions.eta_from_mass1_mass2(self.mass1, self.mass2)
@property
def effective_spin(self):
# FIXME assumes aligned spins
return conversions.chi_eff(self.mass1, self.mass2,
self.spin1z, self.spin2z)
# IMPROVEME: would like to have a way to access all get_freq and/or
# other pnutils.* names rather than hard-coding each one
# - eg make this part of a fancy interface to the bank file ?
@property
def f_seobnrv2_peak(self):
return pnutils.get_freq('fSEOBNRv2Peak', self.mass1, self.mass2,
self.spin1z, self.spin2z)
@property
def f_seobnrv4_peak(self):
return pnutils.get_freq('fSEOBNRv4Peak', self.mass1, self.mass2,
self.spin1z, self.spin2z)
@property
def end_time(self):
return self.get_column('end_time')
@property
def template_duration(self):
return self.get_column('template_duration')
@property
def snr(self):
return self.get_column('snr')
@property
def sgchisq(self):
return self.get_column('sg_chisq')
@property
def u_vals(self):
return self.get_column('u_vals')
@property
def rchisq(self):
return self.get_column('chisq') \
/ (self.get_column('chisq_dof') * 2 - 2)
@property
def psd_var_val(self):
return self.get_column('psd_var_val')
@property
def newsnr(self):
return ranking.newsnr(self.snr, self.rchisq)
@property
def newsnr_sgveto(self):
return ranking.newsnr_sgveto(self.snr, self.rchisq, self.sgchisq)
@property
def newsnr_sgveto_psdvar(self):
return ranking.newsnr_sgveto_psdvar(self.snr, self.rchisq,
self.sgchisq, self.psd_var_val)
@property
def newsnr_sgveto_psdvar_threshold(self):
return ranking.newsnr_sgveto_psdvar_threshold(self.snr, self.rchisq,
self.sgchisq, self.psd_var_val)
    def get_ranking(self, rank_name, **kwargs):
return ranking.get_sngls_ranking_from_trigs(self, rank_name, **kwargs)
    def get_column(self, cname):
# Fiducial value that seems to work, not extensively tuned.
MFRAC = 0.3
# If the mask accesses few enough elements then directly use it
# This can be slower than reading in all the elements if most of them
# will be read.
if self.mask is not None and (isinstance(self.mask, list) or \
(len(self.mask.nonzero()[0]) < (len(self.mask) * MFRAC))):
return self.trigs[cname][self.mask]
        # We have a lot of elements to read, so we resort to reading the
        # entire array before masking.
elif self.mask is not None:
return self.trigs[cname][:][self.mask]
else:
return self.trigs[cname][:]
class ForegroundTriggers(object):
# FIXME: A lot of this is hardcoded to expect two ifos
def __init__(self, coinc_file, bank_file, sngl_files=None, n_loudest=None,
group='foreground'):
self.coinc_file = FileData(coinc_file, group=group)
if 'ifos' in self.coinc_file.h5file.attrs:
self.ifos = self.coinc_file.h5file.attrs['ifos'].split(' ')
else:
self.ifos = [self.coinc_file.h5file.attrs['detector_1'],
self.coinc_file.h5file.attrs['detector_2']]
self.sngl_files = {}
if sngl_files is not None:
for sngl_file in sngl_files:
curr_dat = FileData(sngl_file)
curr_ifo = curr_dat.group_key
self.sngl_files[curr_ifo] = curr_dat
if not all([ifo in self.sngl_files.keys() for ifo in self.ifos]):
print("sngl_files: {}".format(sngl_files))
print("self.ifos: {}".format(self.ifos))
raise RuntimeError("IFOs in statmap file not all represented "
"by single-detector trigger files.")
if not sorted(self.sngl_files.keys()) == sorted(self.ifos):
logging.warning("WARNING: Single-detector trigger files "
"given for IFOs not in the statmap file")
self.bank_file = HFile(bank_file, "r")
self.n_loudest = n_loudest
self._sort_arr = None
self._template_id = None
self._trig_ids = None
self.get_active_segments()
@property
def sort_arr(self):
if self._sort_arr is None:
ifar = self.coinc_file.get_column('ifar')
sorting = ifar.argsort()[::-1]
if self.n_loudest:
sorting = sorting[:self.n_loudest]
self._sort_arr = sorting
return self._sort_arr
@property
def template_id(self):
if self._template_id is None:
template_id = self.get_coincfile_array('template_id')
self._template_id = template_id
return self._template_id
@property
def trig_id(self):
if self._trig_ids is not None:
return self._trig_ids
self._trig_ids = {}
ifos = self.coinc_file.h5file.attrs['ifos'].split(' ')
for ifo in ifos:
trigid = self.get_coincfile_array(ifo + '/trigger_id')
self._trig_ids[ifo] = trigid
return self._trig_ids
    def get_coincfile_array(self, variable):
return self.coinc_file.get_column(variable)[self.sort_arr]
    def get_bankfile_array(self, variable):
try:
return self.bank_file[variable][:][self.template_id]
except IndexError:
if len(self.template_id) == 0:
return np.array([])
raise
    def get_snglfile_array_dict(self, variable):
return_dict = {}
for ifo in self.ifos:
try:
tid = self.trig_id[ifo]
lgc = tid == -1
# Put in *some* value for the invalid points to avoid failure
# Make sure this doesn't change the cached internal array!
tid = np.copy(tid)
tid[lgc] = 0
# If small number of points don't read the full file
if len(tid) < 1000:
curr = []
hdf_dataset = self.sngl_files[ifo].group[variable]
for idx in tid:
curr.append(hdf_dataset[idx])
curr = np.array(curr)
else:
curr = self.sngl_files[ifo].get_column(variable)[tid]
except IndexError:
if len(self.trig_id[ifo]) == 0:
curr = np.array([])
lgc = curr == 0
else:
raise
return_dict[ifo] = (curr, np.logical_not(lgc))
return return_dict
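    # A hedged note on get_snglfile_array_dict: each returned entry is a
    # (values, valid) pair, where 'valid' is False for coincidences in
    # which that ifo did not participate. For example ('fgt' hypothetical):
    #   snrs = fgt.get_snglfile_array_dict('snr')
    #   h1_snr = snrs['H1'][0][snrs['H1'][1]]  # H1-participating events only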
    def get_active_segments(self):
self.active_segments = {}
for ifo in self.ifos:
starts = self.sngl_files[ifo].get_column('search/start_time')
ends = self.sngl_files[ifo].get_column('search/end_time')
self.active_segments[ifo] = veto.start_end_to_segments(starts,
ends)
    def get_end_time(self):
ifos = self.coinc_file.h5file.attrs['ifos'].split(' ')
times_gen = (self.get_coincfile_array('{}/time'.format(ifo))
for ifo in ifos)
ref_times = np.array([mean_if_greater_than_zero(t)[0]
for t in zip(*times_gen)])
return ref_times
    def get_ifos(self):
ifo_list = []
n_trigs = len(self.get_coincfile_array('template_id'))
try: # First try new-style format
ifos = self.coinc_file.h5file.attrs['ifos'].split(' ')
for ifo in ifos:
ifo_trigs = np.where(self.get_coincfile_array(ifo+'/time') < 0,
'-', ifo)
ifo_list.append(ifo_trigs)
ifo_list = [list(trig[trig != '-']) \
for trig in iter(np.array(ifo_list).T)]
except KeyError: # Else fall back on old two-det format
# Currently assumes two-det is Hanford and Livingston
ifo_list = [['H1', 'L1'] for i in range(n_trigs)]
return ifo_list
    def to_coinc_xml_object(self, file_name):
outdoc = ligolw.Document()
outdoc.appendChild(ligolw.LIGO_LW())
ifos = list(self.sngl_files.keys())
proc_id = ligolw_process.register_to_xmldoc(outdoc, 'pycbc',
{}, instruments=ifos, comment='', version=pycbc_version.git_hash,
cvs_repository='pycbc/'+pycbc_version.git_branch,
cvs_entry_time=pycbc_version.date).process_id
search_summ_table = lsctables.New(lsctables.SearchSummaryTable)
coinc_h5file = self.coinc_file.h5file
try:
start_time = coinc_h5file['segments']['coinc']['start'][:].min()
end_time = coinc_h5file['segments']['coinc']['end'][:].max()
except KeyError:
start_times = []
end_times = []
for ifo_comb in coinc_h5file['segments']:
if ifo_comb == 'foreground_veto':
continue
seg_group = coinc_h5file['segments'][ifo_comb]
start_times.append(seg_group['start'][:].min())
end_times.append(seg_group['end'][:].max())
start_time = min(start_times)
end_time = max(end_times)
num_trigs = len(self.sort_arr)
search_summary = return_search_summary(start_time, end_time,
num_trigs, ifos)
search_summ_table.append(search_summary)
outdoc.childNodes[0].appendChild(search_summ_table)
sngl_inspiral_table = lsctables.New(lsctables.SnglInspiralTable)
coinc_def_table = lsctables.New(lsctables.CoincDefTable)
coinc_event_table = lsctables.New(lsctables.CoincTable)
coinc_inspiral_table = lsctables.New(lsctables.CoincInspiralTable)
coinc_event_map_table = lsctables.New(lsctables.CoincMapTable)
time_slide_table = lsctables.New(lsctables.TimeSlideTable)
# Set up time_slide table
time_slide_id = lsctables.TimeSlideID(0)
for ifo in ifos:
time_slide_row = lsctables.TimeSlide()
time_slide_row.instrument = ifo
time_slide_row.time_slide_id = time_slide_id
time_slide_row.offset = 0
time_slide_row.process_id = proc_id
time_slide_table.append(time_slide_row)
# Set up coinc_definer table
coinc_def_id = lsctables.CoincDefID(0)
coinc_def_row = lsctables.CoincDef()
coinc_def_row.search = "inspiral"
coinc_def_row.description = \
"sngl_inspiral<-->sngl_inspiral coincidences"
coinc_def_row.coinc_def_id = coinc_def_id
coinc_def_row.search_coinc_type = 0
coinc_def_table.append(coinc_def_row)
bank_col_names = ['mass1', 'mass2', 'spin1z', 'spin2z']
bank_col_vals = {}
for name in bank_col_names:
bank_col_vals[name] = self.get_bankfile_array(name)
coinc_event_names = ['ifar', 'time', 'fap', 'stat']
coinc_event_vals = {}
for name in coinc_event_names:
if name == 'time':
coinc_event_vals[name] = self.get_end_time()
else:
coinc_event_vals[name] = self.get_coincfile_array(name)
sngl_col_names = ['snr', 'chisq', 'chisq_dof', 'bank_chisq',
'bank_chisq_dof', 'cont_chisq', 'cont_chisq_dof',
'end_time', 'template_duration', 'coa_phase',
'sigmasq']
sngl_col_vals = {}
for name in sngl_col_names:
sngl_col_vals[name] = self.get_snglfile_array_dict(name)
sngl_event_count = 0
for idx in range(len(self.sort_arr)):
# Set up IDs and mapping values
coinc_id = lsctables.CoincID(idx)
# Set up sngls
sngl_mchirps = []
sngl_mtots = []
net_snrsq = 0
triggered_ifos = []
for ifo in ifos:
# If this ifo is not participating in this coincidence then
# ignore it and move on.
if not sngl_col_vals['snr'][ifo][1][idx]:
continue
triggered_ifos += [ifo]
event_id = lsctables.SnglInspiralID(sngl_event_count)
sngl_event_count += 1
sngl = return_empty_sngl()
sngl.event_id = event_id
sngl.ifo = ifo
net_snrsq += sngl_col_vals['snr'][ifo][0][idx]**2
for name in sngl_col_names:
val = sngl_col_vals[name][ifo][0][idx]
if name == 'end_time':
sngl.end = LIGOTimeGPS(val)
else:
setattr(sngl, name, val)
for name in bank_col_names:
val = bank_col_vals[name][idx]
setattr(sngl, name, val)
sngl.mtotal, sngl.eta = pnutils.mass1_mass2_to_mtotal_eta(
sngl.mass1, sngl.mass2)
sngl.mchirp, _ = pnutils.mass1_mass2_to_mchirp_eta(
sngl.mass1, sngl.mass2)
sngl.eff_distance = (sngl.sigmasq)**0.5 / sngl.snr
# If exact match is not used, then take mean
# masses from the single triggers
sngl_mchirps += [sngl.mchirp]
sngl_mtots += [sngl.mtotal]
sngl_inspiral_table.append(sngl)
# Set up coinc_map entry
coinc_map_row = lsctables.CoincMap()
coinc_map_row.table_name = 'sngl_inspiral'
coinc_map_row.coinc_event_id = coinc_id
coinc_map_row.event_id = event_id
coinc_event_map_table.append(coinc_map_row)
sngl_combined_mchirp = np.mean(sngl_mchirps)
sngl_combined_mtot = np.mean(sngl_mtots)
# Set up coinc inspiral and coinc event tables
coinc_event_row = lsctables.Coinc()
coinc_inspiral_row = lsctables.CoincInspiral()
coinc_event_row.coinc_def_id = coinc_def_id
coinc_event_row.nevents = len(triggered_ifos)
# note that simply `coinc_event_row.instruments = triggered_ifos`
# does not lead to a correct result with ligo.lw 1.7.1
coinc_event_row.instruments = ','.join(sorted(triggered_ifos))
coinc_inspiral_row.instruments = triggered_ifos
coinc_event_row.time_slide_id = time_slide_id
coinc_event_row.process_id = proc_id
coinc_event_row.coinc_event_id = coinc_id
coinc_inspiral_row.coinc_event_id = coinc_id
coinc_inspiral_row.mchirp = sngl_combined_mchirp
coinc_inspiral_row.mass = sngl_combined_mtot
coinc_inspiral_row.end = LIGOTimeGPS(coinc_event_vals['time'][idx])
coinc_inspiral_row.snr = net_snrsq**0.5
coinc_inspiral_row.false_alarm_rate = coinc_event_vals['fap'][idx]
coinc_inspiral_row.combined_far = 1./coinc_event_vals['ifar'][idx]
# Transform to Hz
coinc_inspiral_row.combined_far = \
coinc_inspiral_row.combined_far / YRJUL_SI
coinc_event_row.likelihood = coinc_event_vals['stat'][idx]
coinc_inspiral_row.minimum_duration = 0.
coinc_event_table.append(coinc_event_row)
coinc_inspiral_table.append(coinc_inspiral_row)
outdoc.childNodes[0].appendChild(coinc_def_table)
outdoc.childNodes[0].appendChild(coinc_event_table)
outdoc.childNodes[0].appendChild(coinc_event_map_table)
outdoc.childNodes[0].appendChild(time_slide_table)
outdoc.childNodes[0].appendChild(coinc_inspiral_table)
outdoc.childNodes[0].appendChild(sngl_inspiral_table)
ligolw_utils.write_filename(outdoc, file_name)
    def to_coinc_hdf_object(self, file_name):
        ofd = h5py.File(file_name, 'w')
# Some fields are special cases:
logging.info("Outputting search results")
        time = self.get_end_time()
        # float64 is needed to store GPS times without losing precision
        ofd.create_dataset('time', data=time, dtype=np.float64)
ifar = self.get_coincfile_array('ifar')
ofd.create_dataset('ifar', data=ifar, dtype=np.float32)
ifar_exc = self.get_coincfile_array('ifar_exc')
ofd.create_dataset('ifar_exclusive', data=ifar_exc,
dtype=np.float32)
fap = self.get_coincfile_array('fap')
ofd.create_dataset('p_value', data=fap,
dtype=np.float32)
fap_exc = self.get_coincfile_array('fap_exc')
ofd.create_dataset('p_value_exclusive', data=fap_exc,
dtype=np.float32)
# Coinc fields
for field in ['stat']:
vals = self.get_coincfile_array(field)
ofd.create_dataset(field, data=vals, dtype=np.float32)
logging.info("Outputting template information")
# Bank fields
for field in ['mass1','mass2','spin1z','spin2z']:
vals = self.get_bankfile_array(field)
ofd.create_dataset(field, data=vals, dtype=np.float32)
mass1 = self.get_bankfile_array('mass1')
mass2 = self.get_bankfile_array('mass2')
mchirp, _ = mass1_mass2_to_mchirp_eta(mass1, mass2)
ofd.create_dataset('chirp_mass', data=mchirp, dtype=np.float32)
logging.info("Outputting single-trigger information")
logging.info("reduced chisquared")
chisq_vals_valid = self.get_snglfile_array_dict('chisq')
chisq_dof_vals_valid = self.get_snglfile_array_dict('chisq_dof')
for ifo in self.ifos:
chisq_vals = chisq_vals_valid[ifo][0]
chisq_valid = chisq_vals_valid[ifo][1]
chisq_dof_vals = chisq_dof_vals_valid[ifo][0]
rchisq = chisq_vals / (2. * chisq_dof_vals - 2.)
rchisq[np.logical_not(chisq_valid)] = -1.
ofd.create_dataset(ifo + '_chisq', data=rchisq,
dtype=np.float64)
# Single-detector fields
for field in ['sg_chisq', 'end_time', 'sigmasq',
'psd_var_val']:
logging.info(field)
try:
vals_valid = self.get_snglfile_array_dict(field)
            except KeyError:
                logging.info(field + " is not present in the "
                             "single-detector files")
                continue
for ifo in self.ifos:
vals = vals_valid[ifo][0]
valid = vals_valid[ifo][1]
vals[np.logical_not(valid)] = -1.
ofd.create_dataset(ifo + '_'+field, data=vals,
dtype=np.float32)
snr_vals_valid = self.get_snglfile_array_dict('snr')
network_snr_sq = np.zeros_like(snr_vals_valid[self.ifos[0]][0])
for ifo in self.ifos:
vals = snr_vals_valid[ifo][0]
valid = snr_vals_valid[ifo][1]
vals[np.logical_not(valid)] = -1.
ofd.create_dataset(ifo + '_snr', data=vals,
dtype=np.float32)
network_snr_sq[valid] += vals[valid] ** 2.0
network_snr = np.sqrt(network_snr_sq)
ofd.create_dataset('network_snr', data=network_snr, dtype=np.float32)
logging.info("Triggered detectors")
# This creates a n_ifos by n_events matrix, with the ifo letter
# if the event contains a trigger from the ifo, empty string if not
triggered_matrix = [[ifo[0] if v else ''
for v in snr_vals_valid[ifo][1]]
for ifo in self.ifos]
# This combines the ifo letters to make a single string per event
triggered_detectors = [''.join(triggered).encode('ascii')
for triggered in zip(*triggered_matrix)]
ofd.create_dataset('trig', data=triggered_detectors,
dtype='<S3')
logging.info("active detectors")
# This creates a n_ifos by n_events matrix, with the ifo letter
# if the ifo was active at the event time, empty string if not
active_matrix = [[ifo[0] if t in self.active_segments[ifo]
else '' for t in time]
for ifo in self.ifos]
# This combines the ifo letters to make a single string per event
active_detectors = [''.join(active_at_time).encode('ascii')
for active_at_time in zip(*active_matrix)]
ofd.create_dataset('obs', data=active_detectors,
dtype='<S3')
ofd.close()
class ReadByTemplate(object):
# default assignment to {} is OK for a variable used only in __init__
def __init__(self, filename, bank=None, segment_name=None, veto_files=None,
gating_veto_windows={}):
self.filename = filename
self.file = h5py.File(filename, 'r')
self.ifo = tuple(self.file.keys())[0]
        self.valid = None
        # set by set_template(); __getitem__ checks this before reading
        self.template_num = None
self.bank = h5py.File(bank, 'r') if bank else {}
# Determine the segments which define the boundaries of valid times
# to use triggers
key = '%s/search/' % self.ifo
s, e = self.file[key + 'start_time'][:], self.file[key + 'end_time'][:]
self.segs = veto.start_end_to_segments(s, e).coalesce()
if segment_name is None:
segment_name = []
if veto_files is None:
veto_files = []
for vfile, name in zip(veto_files, segment_name):
veto_segs = veto.select_segments_by_definer(vfile, ifo=self.ifo,
segment_name=name)
self.segs = (self.segs - veto_segs).coalesce()
if self.ifo in gating_veto_windows:
gating_veto = gating_veto_windows[self.ifo].split(',')
gveto_before = float(gating_veto[0])
gveto_after = float(gating_veto[1])
if gveto_before > 0 or gveto_after < 0:
raise ValueError("Gating veto window values must be negative "
"before gates and positive after gates.")
if not (gveto_before == 0 and gveto_after == 0):
gate_times = np.unique(
self.file[self.ifo + '/gating/auto/time'][:])
gating_veto_segs = veto.start_end_to_segments(gate_times + gveto_before,
gate_times + gveto_after).coalesce()
self.segs = (self.segs - gating_veto_segs).coalesce()
self.valid = veto.segments_to_start_end(self.segs)
    def get_data(self, col, num):
"""Get a column of data for template with id 'num'.
Parameters
----------
col: str
Name of column to read
num: int
The template id to read triggers for
Returns
-------
data: numpy.ndarray
The requested column of data
"""
ref = self.file['%s/%s_template' % (self.ifo, col)][num]
return self.file['%s/%s' % (self.ifo, col)][ref]
    def set_template(self, num):
"""Set the active template to read from.
Parameters
----------
num: int
The template id to read triggers for.
Returns
-------
trigger_id: numpy.ndarray
            The indices of this template's triggers.
"""
self.template_num = num
times = self.get_data('end_time', num)
# Determine which of these template's triggers are kept after
# applying vetoes
if self.valid:
self.keep = veto.indices_within_times(times, self.valid[0],
self.valid[1])
# logging.info('applying vetoes')
else:
self.keep = np.arange(0, len(times))
if self.bank != {}:
self.param = {}
if 'parameters' in self.bank.attrs:
for col in self.bank.attrs['parameters']:
self.param[col] = self.bank[col][self.template_num]
else:
for col in self.bank:
self.param[col] = self.bank[col][self.template_num]
        # Calculate the trigger id by adding the relative offset in self.keep
        # to the absolute starting index of this template's triggers as
        # stored in 'template_boundaries'
trigger_id = self.keep + \
self.file['%s/template_boundaries' % self.ifo][num]
return trigger_id
def __getitem__(self, col):
""" Return the column of data for current active template after
applying vetoes
Parameters
----------
col: str
Name of column to read
Returns
-------
data: numpy.ndarray
The requested column of data
"""
if self.template_num is None:
raise ValueError('You must call set_template to first pick the '
'template to read data from')
data = self.get_data(col, self.template_num)
data = data[self.keep] if self.valid else data
return data
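# A hedged sketch of iterating triggers template-by-template with
# ReadByTemplate (the file names and template count are illustrative):
#   trigf = ReadByTemplate('H1-TRIGGERS.hdf', bank='BANK.hdf')
#   for tnum in range(10):
#       tids = trigf.set_template(tnum)
#       snr = trigf['snr']  # trigger column for this template, vetoes applied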
chisq_choices = ['traditional', 'cont', 'bank', 'max_cont_trad', 'sg',
'max_bank_cont', 'max_bank_trad', 'max_bank_cont_trad']
def get_chisq_from_file_choice(hdfile, chisq_choice):
    """ Return the chisq values of the given choice from an open hdf5
    trigger file or group.

    Parameters
    ----------
    hdfile: h5py File or Group object
        Object containing the relevant chisq datasets
    chisq_choice: str
        One of the options listed in `chisq_choices`

    Returns
    -------
    chisq: numpy.ndarray
        The requested chisq values
    """
    f = hdfile
    if chisq_choice in ['traditional', 'max_cont_trad', 'max_bank_trad',
'max_bank_cont_trad']:
trad_chisq = f['chisq'][:]
# We now need to handle the case where chisq is not actually calculated
# 0 is used as a sentinel value
trad_chisq_dof = f['chisq_dof'][:]
trad_chisq /= (trad_chisq_dof * 2 - 2)
if chisq_choice in ['cont', 'max_cont_trad', 'max_bank_cont',
'max_bank_cont_trad']:
cont_chisq = f['cont_chisq'][:]
cont_chisq_dof = f['cont_chisq_dof'][:]
cont_chisq /= cont_chisq_dof
if chisq_choice in ['bank', 'max_bank_cont', 'max_bank_trad',
'max_bank_cont_trad']:
bank_chisq = f['bank_chisq'][:]
bank_chisq_dof = f['bank_chisq_dof'][:]
bank_chisq /= bank_chisq_dof
if chisq_choice == 'sg':
chisq = f['sg_chisq'][:]
elif chisq_choice == 'traditional':
chisq = trad_chisq
elif chisq_choice == 'cont':
chisq = cont_chisq
elif chisq_choice == 'bank':
chisq = bank_chisq
elif chisq_choice == 'max_cont_trad':
chisq = np.maximum(trad_chisq, cont_chisq)
elif chisq_choice == 'max_bank_cont':
chisq = np.maximum(bank_chisq, cont_chisq)
elif chisq_choice == 'max_bank_trad':
chisq = np.maximum(bank_chisq, trad_chisq)
elif chisq_choice == 'max_bank_cont_trad':
chisq = np.maximum(np.maximum(bank_chisq, cont_chisq), trad_chisq)
else:
err_msg = "Do not recognize --chisq-choice %s" % chisq_choice
raise ValueError(err_msg)
return chisq
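# A hedged example of get_chisq_from_file_choice (assumes an open trigger
# file with an 'H1' group holding the relevant chisq datasets):
#   with HFile('H1-TRIGGERS.hdf', 'r') as hf:
#       chisq = get_chisq_from_file_choice(hf['H1'], 'max_cont_trad')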
def save_dict_to_hdf5(dic, filename):
"""
Parameters
----------
dic:
python dictionary to be converted to hdf5 format
filename:
desired name of hdf5 file
"""
with h5py.File(filename, 'w') as h5file:
recursively_save_dict_contents_to_group(h5file, '/', dic)
def recursively_save_dict_contents_to_group(h5file, path, dic):
"""
Parameters
----------
h5file:
h5py file to be written to
path:
path within h5py file to saved dictionary
dic:
python dictionary to be converted to hdf5 format
"""
for key, item in dic.items():
if isinstance(item, (np.ndarray, np.int64, np.float64, str, int, float,
bytes, tuple, list)):
h5file[path + str(key)] = item
elif isinstance(item, dict):
recursively_save_dict_contents_to_group(h5file, path + key + '/', item)
else:
raise ValueError('Cannot save %s type' % type(item))
def load_hdf5_to_dict(h5file, path):
"""
Parameters
----------
h5file:
h5py file to be loaded as a dictionary
path:
path within h5py file to load: '/' for the whole h5py file
Returns
-------
dic:
dictionary with hdf5 file group content
"""
dic = {}
for key, item in h5file[path].items():
if isinstance(item, h5py.Dataset):
dic[key] = item[()]
elif isinstance(item, h5py.Group):
dic[key] = load_hdf5_to_dict(h5file, path + key + '/')
else:
raise ValueError('Cannot load %s type' % type(item))
return dic
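# A hedged round-trip sketch for the dict <-> hdf5 helpers (the file name
# and dictionary contents are illustrative):
#   save_dict_to_hdf5({'a': np.arange(3), 'grp': {'b': 1.5}}, 'out.hdf')
#   with h5py.File('out.hdf', 'r') as hf:
#       dic = load_hdf5_to_dict(hf, '/')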
def combine_and_copy(f, files, group):
    """ Combine the same column from multiple files and save it to the
    output file """
# ensure that the files input is stable for iteration order
assert isinstance(files, (list, tuple))
f[group] = np.concatenate([fi[group][:] if group in fi else \
np.array([], dtype=np.uint32) for fi in files])
def name_all_datasets(files):
    """ Return the set of all dataset names appearing in the given open
    hdf5 files """
    assert isinstance(files, (list, tuple))
datasets = []
for fi in files:
datasets += get_all_subkeys(fi, '/')
return set(datasets)
def get_all_subkeys(grp, key):
    """ Recursively list the paths of all datasets below the given key """
    subkey_list = []
subkey_start = key
if key == '':
grpk = grp
else:
grpk = grp[key]
for sk in grpk.keys():
path = subkey_start + '/' + sk
if isinstance(grp[path], h5py.Dataset):
subkey_list.append(path.lstrip('/'))
else:
subkey_list += get_all_subkeys(grp, path)
# returns an empty list if there is no dataset or subgroup within the group
return subkey_list
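# A hedged sketch combining every dataset from several files into one
# output file (the file names are illustrative):
#   ins = [HFile(fn, 'r') for fn in ['a.hdf', 'b.hdf']]
#   with HFile('combined.hdf', 'w') as out:
#       for dset in name_all_datasets(ins):
#           combine_and_copy(out, ins, dset)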
#
# =============================================================================
#
# Checkpointing utilities
#
# =============================================================================
#
def dump_state(state, fp, path=None, dsetname='state',
protocol=pickle.HIGHEST_PROTOCOL):
"""Dumps the given state to an hdf5 file handler.
The state is stored as a raw binary array to ``{path}/{dsetname}`` in the
given hdf5 file handler. If a dataset with the same name and path is
already in the file, the dataset will be resized and overwritten with the
new state data.
Parameters
----------
state : any picklable object
The sampler state to dump to file. Can be the object returned by
any of the samplers' `.state` attribute (a dictionary of dictionaries),
or any picklable object.
fp : h5py.File
An open hdf5 file handler. Must have write capability enabled.
path : str, optional
The path (group name) to store the state dataset to. Default (None)
will result in the array being stored to the top level.
dsetname : str, optional
The name of the dataset to store the binary array to. Default is
``state``.
protocol : int, optional
The protocol version to use for pickling. See the :py:mod:`pickle`
module for more details.
"""
memfp = BytesIO()
pickle.dump(state, memfp, protocol=protocol)
dump_pickle_to_hdf(memfp, fp, path=path, dsetname=dsetname)
def dump_pickle_to_hdf(memfp, fp, path=None, dsetname='state'):
"""Dumps pickled data to an hdf5 file object.
Parameters
----------
memfp : file object
Bytes stream of pickled data.
fp : h5py.File
An open hdf5 file handler. Must have write capability enabled.
path : str, optional
The path (group name) to store the state dataset to. Default (None)
will result in the array being stored to the top level.
dsetname : str, optional
The name of the dataset to store the binary array to. Default is
``state``.
"""
memfp.seek(0)
bdata = np.frombuffer(memfp.read(), dtype='S1')
if path is not None:
dsetname = path + '/' + dsetname
if dsetname not in fp:
fp.create_dataset(dsetname, shape=bdata.shape, maxshape=(None,),
dtype=bdata.dtype)
elif bdata.size != fp[dsetname].shape[0]:
fp[dsetname].resize((bdata.size,))
fp[dsetname][:] = bdata
def load_state(fp, path=None, dsetname='state'):
"""Loads a sampler state from the given hdf5 file object.
The sampler state is expected to be stored as a raw bytes array which can
be loaded by pickle.
Parameters
----------
fp : h5py.File
An open hdf5 file handler.
path : str, optional
The path (group name) that the state data is stored to. Default (None)
is to read from the top level.
dsetname : str, optional
The name of the dataset that the state data is stored to. Default is
``state``.
"""
if path is not None:
fp = fp[path]
bdata = fp[dsetname][()].tobytes()
return pickle.load(BytesIO(bdata))
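# A hedged checkpointing round trip ('checkpoint.hdf' and the state contents
# are illustrative; any picklable object works):
#   with HFile('checkpoint.hdf', 'a') as fp:
#       dump_state({'iteration': 10, 'positions': np.zeros(4)}, fp,
#                  path='sampler')
#   with HFile('checkpoint.hdf', 'r') as fp:
#       state = load_state(fp, path='sampler')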
__all__ = ('HFile', 'DictArray', 'StatmapData', 'MultiifoStatmapData',
'FileData', 'DataFromFiles', 'SingleDetTriggers',
'ForegroundTriggers', 'ReadByTemplate', 'chisq_choices',
'get_chisq_from_file_choice', 'save_dict_to_hdf5',
'recursively_save_dict_contents_to_group', 'load_hdf5_to_dict',
'combine_and_copy', 'name_all_datasets', 'get_all_subkeys',
'dump_state', 'dump_pickle_to_hdf', 'load_state')