
# Copyright (C) 2014  Alex Nitz
#
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


#
# =============================================================================
#
#                                   Preamble
#
# =============================================================================
#
""" This module provides thin wrappers around Pegasus.DAX3 functionality that
provides additional abstraction and argument handling.
"""
import os
from six.moves.urllib.request import pathname2url
from six.moves.urllib.parse import urljoin, urlsplit
from Pegasus.catalogs.transformation_catalog import TransformationCatalog
import Pegasus.DAX3 as dax

class ProfileShortcuts(object):
    """ Container of common methods for setting pegasus profile information
    on Executables and nodes. This class expects to be inherited from,
    and for an add_profile method to be implemented.
    """
    def set_memory(self, size):
        """ Set the amount of memory that is required in megabytes
        """
        self.add_profile('condor', 'request_memory', '%sM' % size)

    def set_storage(self, size):
        """ Set the amount of storage required in megabytes
        """
        self.add_profile('condor', 'request_disk', '%sM' % size)

    def set_num_cpus(self, number):
        self.add_profile('condor', 'request_cpus', number)

    def set_universe(self, universe):
        # 'is' tests object identity, not string equality; '==' is correct
        if universe == 'standard':
            self.add_profile("pegasus", "gridstart", "none")

        self.add_profile("condor", "universe", universe)

    def set_category(self, category):
        self.add_profile('dagman', 'category', category)

    def set_priority(self, priority):
        self.add_profile('dagman', 'priority', priority)

    def set_num_retries(self, number):
        self.add_profile("dagman", "retry", number)

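# Example usage (an illustrative sketch, not part of the module): any object
# that inherits ProfileShortcuts and implements add_profile, such as a Node,
# can use these setters directly. 'exe' below is a hypothetical Executable.
#
#     node = Node(exe)
#     node.set_memory(2048)     # request 2048 MB of RAM via a condor profile
#     node.set_num_cpus(4)
#     node.set_num_retries(3)
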
class Executable(ProfileShortcuts):
    """ The workflow representation of an Executable
    """
    id = 0

    def __init__(self, name, namespace=None, os='linux', arch='x86_64',
                 installed=True, version=None, container=None):
        self.logical_name = name + "_ID%s" % str(Executable.id)
        Executable.id += 1
        # The undecorated name is needed by is_same_as() below
        self.pegasus_name = name
        self.namespace = namespace
        self.version = version
        if container:
            self._dax_executable = dax.Executable(self.logical_name,
                    namespace=self.namespace, version=version, os=os,
                    arch=arch, installed=installed, container=container)
        else:
            self._dax_executable = dax.Executable(self.logical_name,
                    namespace=self.namespace, version=version, os=os,
                    arch=arch, installed=installed)
        self.in_workflow = False
        self.pfns = {}

    def clear_pfns(self):
        self._dax_executable.clearPFNs()

    def add_pfn(self, url, site='local'):
        self._dax_executable.PFN(url, site)
        self.pfns[site] = url

    def get_pfn(self, site='local'):
        return self.pfns[site]

    def insert_into_dax(self, dax):
        dax.addExecutable(self._dax_executable)

    def add_profile(self, namespace, key, value, force=False):
        """ Add profile information to this executable
        """
        try:
            entry = dax.Profile(namespace, key, value)
            self._dax_executable.addProfile(entry)
        except dax.DuplicateError:
            if force:
                # Replace with the new key
                self._dax_executable.removeProfile(entry)
                self._dax_executable.addProfile(entry)

    def is_same_as(self, other):
        test_vals = ['namespace', 'version', 'arch', 'os', 'osrelease',
                     'osversion', 'glibc', 'installed', 'container']
        # Check for logical name first
        if not self.pegasus_name == other.pegasus_name:
            return False

        # Check the properties of the executable
        for val in test_vals:
            sattr = getattr(self._dax_executable, val)
            oattr = getattr(other._dax_executable, val)
            if not sattr == oattr:
                return False

        # Also check the "profile". This is things like Universe, RAM/disk/CPU
        # requests, execution site, getenv=True, etc.
        for profile in self._dax_executable.profiles:
            if profile not in other._dax_executable.profiles:
                return False
        for profile in other._dax_executable.profiles:
            if profile not in self._dax_executable.profiles:
                return False

        return True

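# Example usage (an illustrative sketch; the executable name and path are
# hypothetical): declare an executable and record where it can be found.
#
#     exe = Executable('pycbc_inspiral', namespace='pycbc', installed=True)
#     exe.add_pfn('file:///usr/bin/pycbc_inspiral', site='local')
#     exe.set_memory(4096)   # inherited from ProfileShortcuts
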
class Node(ProfileShortcuts):
    def __init__(self, executable):
        self.in_workflow = False
        self.executable = executable
        self._inputs = []
        self._outputs = []
        self._dax_node = dax.Job(name=executable.logical_name,
                                 version=executable.version,
                                 namespace=executable.namespace)
        self._args = []
        # Each value in _options is added separated with whitespace
        # so ['--option','value'] --> "--option value"
        self._options = []
        # For _raw_options *NO* whitespace is added.
        # so ['--option','value'] --> "--optionvalue"
        # and ['--option',' ','value'] --> "--option value"
        self._raw_options = []

    def add_arg(self, arg):
        """ Add an argument
        """
        if not isinstance(arg, File):
            arg = str(arg)
        self._args += [arg]

    def add_raw_arg(self, arg):
        """ Add an argument to the command line of this job, but do *NOT* add
        white space between arguments. This can be added manually by adding
        ' ' if needed
        """
        if not isinstance(arg, File):
            arg = str(arg)
        self._raw_options += [arg]

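    # Example (an illustrative sketch; option and file names are
    # hypothetical): raw arguments are joined with *no* separator, so any
    # needed whitespace must be supplied explicitly:
    #
    #     node.add_raw_arg('--frame-cache')
    #     node.add_raw_arg(' ')
    #     node.add_raw_arg('cache.lcf')
    #
    # which contributes "--frame-cache cache.lcf" to the command line.
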
    def add_opt(self, opt, value=None):
        """ Add an option
        """
        if value is not None:
            if not isinstance(value, File):
                value = str(value)
            self._options += [opt, value]
        else:
            self._options += [opt]

    def add_input(self, inp):
        """Declares an input file without adding it as a command-line option.
        """
        self._add_input(inp)

    # private functions to add input and output data sources/sinks
    def _add_input(self, inp):
        """ Add as source of input data
        """
        self._inputs += [inp]
        inp._set_as_input_of(self)

    def _add_output(self, out):
        """ Add as destination of output data
        """
        self._outputs += [out]
        out.node = self
        out._set_as_output_of(self)

    # public functions to add options, arguments with or without data sources
    def add_input_opt(self, opt, inp):
        """ Add an option that determines an input
        """
        self.add_opt(opt, inp._dax_repr())
        self._add_input(inp)

    def add_output_opt(self, opt, out):
        """ Add an option that determines an output
        """
        self.add_opt(opt, out._dax_repr())
        self._add_output(out)

    def add_output_list_opt(self, opt, outputs):
        """ Add an option that determines a list of outputs
        """
        self.add_opt(opt)
        for out in outputs:
            self.add_opt(out)
            self._add_output(out)

    def add_input_list_opt(self, opt, inputs):
        """ Add an option that determines a list of inputs
        """
        self.add_opt(opt)
        for inp in inputs:
            self.add_opt(inp)
            self._add_input(inp)

    def add_list_opt(self, opt, values):
        """ Add an option with a list of non-file parameters.
        """
        self.add_opt(opt)
        for val in values:
            self.add_opt(val)

    def add_input_arg(self, inp):
        """ Add an input as an argument
        """
        self.add_arg(inp._dax_repr())
        self._add_input(inp)

    def add_output_arg(self, out):
        """ Add an output as an argument
        """
        self.add_arg(out._dax_repr())
        self._add_output(out)

    def new_output_file_opt(self, opt, name):
        """ Add an option and return a new file handle
        """
        fil = File(name)
        self.add_output_opt(opt, fil)
        return fil

    # functions to describe properties of this node
    def add_profile(self, namespace, key, value, force=False):
        """ Add profile information to this node at the DAX level
        """
        try:
            entry = dax.Profile(namespace, key, value)
            self._dax_node.addProfile(entry)
        except dax.DuplicateError:
            if force:
                # Replace with the new key
                self._dax_node.removeProfile(entry)
                self._dax_node.addProfile(entry)

    def _finalize(self):
        args = self._args + self._options
        self._dax_node.addArguments(*args)
        if len(self._raw_options):
            raw_args = [' '] + self._raw_options
            self._dax_node.addRawArguments(*raw_args)

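# Example usage (an illustrative sketch; the option and file names are
# hypothetical): build a node, wiring inputs and outputs so that the workflow
# can later infer parent/child relations from the data dependencies.
#
#     node = Node(exe)
#     node.add_opt('--verbose')
#     node.add_input_opt('--input-file', File('in.hdf'))
#     out = node.new_output_file_opt('--output-file', 'out.hdf')
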
class Workflow(object):
    """ The workflow representation of a Pegasus ADAG (DAX).
    """
    def __init__(self, name='my_workflow'):
        self.name = name
        self._adag = dax.ADAG(name)

        self._inputs = []
        self._outputs = []
        self._executables = []
        self.in_workflow = False
        self.sub_workflows = []
        self._external_workflow_inputs = []
        self.filename = self.name + '.dax'
        self.as_job = dax.DAX(self.filename)

    def _make_root_dependency(self, inp):
        def root_path(v):
            path = [v]
            while v.in_workflow:
                path += [v.in_workflow]
                v = v.in_workflow
            return path

        workflow_root = root_path(self)
        input_root = root_path(inp)
        for step in workflow_root:
            if step in input_root:
                common = step
                break
        dep = dax.Dependency(
            parent=input_root[input_root.index(common) - 1].as_job,
            child=workflow_root[workflow_root.index(common) - 1].as_job)
        common._adag.addDependency(dep)

    def add_workflow(self, workflow):
        """ Add a sub-workflow to this workflow

        This function adds a sub-workflow of Workflow class to this workflow.
        Parent child relationships are determined by data dependencies

        Parameters
        ----------
        workflow : Workflow instance
            The sub-workflow to add to this one
        """
        workflow.in_workflow = self
        self.sub_workflows += [workflow]

        node = workflow.as_job
        self._adag.addJob(node)
        node.file.PFN(os.path.join(os.getcwd(), node.file.name),
                      site='local')
        self._adag.addFile(node.file)

        for inp in workflow._external_workflow_inputs:
            workflow._make_root_dependency(inp.node)

        return self

    def add_node(self, node):
        """ Add a node to this workflow

        This function adds nodes to the workflow. It also determines
        parent/child relations from the DataStorage inputs to this job.

        Parameters
        ----------
        node : pycbc.workflow.pegasus_workflow.Node
            A node that should be executed as part of this workflow.
        """
        node._finalize()
        node.in_workflow = self

        # Record the executable that this node uses
        if not node.executable.in_workflow:
            for exe in self._executables:
                if node.executable.is_same_as(exe):
                    node.executable.in_workflow = True
                    node._dax_node.name = exe.logical_name
                    node.executable.logical_name = exe.logical_name
                    break
            else:
                node.executable.in_workflow = True
                self._executables += [node.executable]

        # Add the node itself
        self._adag.addJob(node._dax_node)

        # Determine the parent child relationships based on the inputs that
        # this node requires.
        added_nodes = []
        for inp in node._inputs:
            if inp.node is not None and inp.node.in_workflow == self:
                if inp.node not in added_nodes:
                    parent = inp.node._dax_node
                    child = node._dax_node
                    dep = dax.Dependency(parent=parent, child=child)
                    self._adag.addDependency(dep)
                    added_nodes.append(inp.node)
            elif inp.node is not None and not inp.node.in_workflow:
                raise ValueError('Parents of this node must be added to the '
                                 'workflow first.')
            elif inp.node is None and not inp.workflow_input:
                self._inputs += [inp]
                inp.workflow_input = True
            elif (inp.node is not None and inp.node.in_workflow != self
                    and inp not in self._inputs):
                self._inputs += [inp]
                self._external_workflow_inputs += [inp]

        # Record the outputs that this node generates
        self._outputs += node._outputs
        return self

    def __add__(self, other):
        if isinstance(other, Node):
            return self.add_node(other)
        elif isinstance(other, Workflow):
            return self.add_workflow(other)
        else:
            raise TypeError('Cannot add type %s to this workflow'
                            % type(other))

    def save(self, filename=None, tc=None):
        """ Write this workflow to DAX file
        """
        if filename is None:
            filename = self.filename

        for sub in self.sub_workflows:
            sub.save()

        # FIXME this is ugly as pegasus 4.9.0 does not support the full
        # transformation catalog in the DAX. I have asked Karan to fix this so
        # that executables and containers can be specified in the DAX itself.
        # Karan says that XML is going away in Pegasus 5.x and so this code
        # will need to be re-written anyway.
        #
        # The transformation catalog is written in the same directory as the
        # DAX. pycbc_submit_dax needs to know this so that the right
        # transformation catalog is used when the DAX is planned.
        if tc is None:
            tc = '{}.tc.txt'.format(filename)
        p = os.path.dirname(tc)
        f = os.path.basename(tc)
        if not p:
            p = '.'
        tc = TransformationCatalog(p, f)
        for e in self._adag.executables.copy():
            tc.add(e)
            try:
                tc.add_container(e.container)
            except Exception:
                # not every executable has an associated container
                pass
            self._adag.removeExecutable(e)

        f = open(filename, "w")
        self._adag.writeXML(f)
        tc.write()

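# Example usage (an illustrative sketch; executable and file names are
# hypothetical): assemble a small workflow and write it to disk. The '+='
# operator dispatches to add_node() or add_workflow() as appropriate.
#
#     wf = Workflow('example')
#     exe = Executable('pycbc_inspiral')
#     exe.add_pfn('file:///usr/bin/pycbc_inspiral')
#     node = Node(exe)
#     node.new_output_file_opt('--output-file', 'out.hdf')
#     wf += node
#     wf.save()   # writes example.dax and example.dax.tc.txt
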
class DataStorage(object):
    """ A workflow representation of a place to store and read data from.

    The abstract representation of a place to store and read data from. This
    can include files, databases, or remote connections. This object is used
    as a handle to pass between functions, and as a way to logically
    represent the ordering of operations on the physical data.
    """
    def __init__(self, name):
        self.name = name
        self.node = None
        self.workflow_input = False

    def _set_as_node_input(self):
        pass

    def _set_as_node_output(self):
        pass

    def _dax_repr(self):
        return self.name

class File(DataStorage, dax.File):
    """ The workflow representation of a physical file

    An object that represents a file from the perspective of setting up a
    workflow. The file may or may not exist at the time of workflow
    generation. If it does, this is represented by containing a physical
    file name (PFN). A storage path is also available to indicate the
    desired final destination of this file.
    """
    def __init__(self, name):
        DataStorage.__init__(self, name)
        dax.File.__init__(self, name)
        self.storage_path = None

    def _dax_repr(self):
        return self

    @property
    def dax_repr(self):
        """Return the dax representation of a File."""
        return self._dax_repr()

    def _set_as_input_of(self, node):
        node._dax_node.uses(self, link=dax.Link.INPUT, register=False,
                            transfer=True)

    def _set_as_output_of(self, node):
        if self.storage_path:
            transfer_file = True
        else:
            transfer_file = False
        node._dax_node.uses(self, link=dax.Link.OUTPUT, register=True,
                            transfer=transfer_file)

    def output_map_str(self):
        if self.storage_path:
            return '%s %s pool="%s"' % (self.name, self.storage_path,
                                        'local')
        else:
            raise ValueError('This file does not have a storage path')

    def has_pfn(self, url, site=None):
        """ Wrapper of the pegasus hasPFN function, that allows it to be
        called outside of specific pegasus functions.
        """
        curr_pfn = dax.PFN(url, site)
        return self.hasPFN(curr_pfn)

    def insert_into_dax(self, dax):
        dax.addFile(self)

    @classmethod
    def from_path(cls, path):
        """Takes a path and returns a File object with the path as the PFN."""
        urlparts = urlsplit(path)
        site = 'nonlocal'
        if (urlparts.scheme == '' or urlparts.scheme == 'file'):
            if os.path.isfile(urlparts.path):
                path = os.path.abspath(urlparts.path)
                path = urljoin('file:', pathname2url(path))
                site = 'local'

        fil = File(os.path.basename(path))
        fil.PFN(path, site)
        return fil

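# Example usage (an illustrative sketch; the path is hypothetical): create a
# File whose PFN points at an existing local file.
#
#     frame = File.from_path('/data/frames/H-H1_DATA-1126051217-4096.gwf')
#     print(frame.name)   # 'H-H1_DATA-1126051217-4096.gwf'
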
class Database(DataStorage):
    pass