Source code for xopto.mccyl.mc

# -*- coding: utf-8 -*-
################################ Begin license #################################
# Copyright (C) Laboratory of Imaging technologies,
#               Faculty of Electrical Engineering,
#               University of Ljubljana.
#
# This file is part of PyXOpto.
#
# PyXOpto is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyXOpto is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyXOpto. If not, see <https://www.gnu.org/licenses/>.
################################# End license ##################################

from typing import List, Tuple, Dict

import os
import time
import ctypes

import numpy as np
import pyopencl as cl
import jinja2

from xopto import ROOT_PATH, USER_DATA_PATH
from xopto.cl import clrng, clinfo, cloptions

from xopto.mccyl.mcobject import McObject
from xopto.mccyl import cltypes
from xopto.mccyl import mctypes
from xopto.mccyl import mclayer
from xopto.mccyl import mcsurface
from xopto.mccyl import mcdetector
from xopto.mccyl import mctrace
from xopto.mccyl import mcsv
from xopto.mccyl import mcfluence
from xopto.mccyl import mcsource
from xopto.mccyl import mcpf
from xopto.mccyl import mcprogress
from xopto.mccyl import mcoptions
from xopto.mccyl.mcutil.buffer import NumpyAllocators

from xopto import MCBASE_PATH, MCCYL_PATH
from xopto.mcbase import mcsrc
from xopto.mcbase import mcworker
from xopto.mcvox.mcfluence import FLUENCE_TYPE

MC_RESULT_TYPE = Tuple[
    mctrace.Trace or None,
    mcfluence.FLUENCE_TYPE or None,
    mcdetector.Detectors or None
]

# Instructions for fusing OpenCL source files into a single file.
_MCML_FUSION_SPEC = {
    'inputs': [
        os.path.join(MCBASE_PATH, 'kernel', 'mcbase.template.h'),
        os.path.join(MCBASE_PATH, 'kernel', 'mcbase.template.c'),
        os.path.join(MCBASE_PATH, 'kernel', 'mcsv.template.h'),
        os.path.join(MCBASE_PATH, 'kernel', 'mcsv.template.c'),
        os.path.join(MCCYL_PATH, 'kernel', 'mccyl.template.h'),
        os.path.join(MCCYL_PATH, 'kernel', 'mccyl.template.c'),
    ],
    'output': os.path.join(MCCYL_PATH, 'kernel', 'mccyl.fusion.template.h')
}

[docs]class Mc(mcworker.ClWorkerStandardBufferLutMixin, mcworker.ClWorkerRngMixin,
         mcworker.ClWorker):

    MC_CL_TEMPLATE_FILE = os.path.join(
        ROOT_PATH, 'mccyl', 'kernel', 'mc.template.h')
    ''' Location of the OpenCL source template file. '''

    AUTOGEN_PATH = os.path.join(USER_DATA_PATH, 'mccyl', 'auto')
    ''' Directory that will be used to save the rendered OpenCL source code. '''

    if not os.path.exists(AUTOGEN_PATH):
        try:
            os.makedirs(AUTOGEN_PATH, exist_ok=True)
        except OSError:
            pass

    def __init__(self, layers: mclayer.Layers or List[mclayer.Layer],
                 source: mcsource.Source,
                 detectors: mcdetector.Detectors = None,
                 trace: mctrace.Trace = None,
                 fluence: mcfluence.FLUENCE_TYPE = None,
                 surface: mcsurface.SurfaceLayouts = None,
                 types: mctypes.McDataTypesBase = mctypes.McDataTypesSingle,
                 options: List[mcoptions.McOption] = None,
                 rnginit: np.uint64 = None,
                 cl_devices: str or cl.Device or List[cl.Device] or 
                             cl.Context or cl.CommandQueue = None,
                 cl_build_options: List[str or cloptions.ClBuildOption] = None,
                 cl_profiling : bool = False):
        '''
        Cylindrical Multilayer Monte Carlo light propagation simulator object
        constructor.
        The object uses existing OpenCL kernel code in the mccyl.h and mccyl.c
        files and adds additional code and kernel configuration options
        generated by the source, detector, trace, fluence and other objects
        passed to the constructor.
        The generated OpenCL code is compiled just before the first call to
        the kernel/simulation invoked by the run method.
        All publically accessible properties of the objects
        (layers, source, detector, trace, fluence, ...) can be changed
        between simulations.

        Parameters
        ----------
        layers: mclayer.Layers or list[mclayer.Layer]
            A python list of layers. The list must contain at
            least two layers. The first layer represent the
            medium that surrounds the sample and the remaining layers represent
            the sample.
            The layers must be specified with monotonically decreasing diameter.
            Note that the diameter of the outermost (first) layer that
            represents the surrounding medium will be automatically set to
            infinity.

        source: mcsource.Source
            One of the available photon packet source object:

            - Line - Infinity thin beam.
            - UniformBeam - Collimated uniform beam.
            - GaussianBeam - Collimated beam with a Gaussian crosssection.
            - IsotropicPoint - Isotropic point source.

            See the :py:mod:`xopto.mcml.mcsource` module for further sources.

        detectors: mcdetector.Detectors
            A set of detectors for the outer sample surface and
            specular reflections.

            The Detector object will set the following simulator options:

            - MC_USE_DETECTORS
            - MC_USE_OUTER_DETECTOR
            - MC_USE_SPECULAR_DETECTOR
            - MC_USE_FP_LUT (if a lookup table is used)

        trace: mctrace.Trace
            Trace object that can collect the various states of the photon
            packet including the initial, all intermediate and/or the final.

            The Trace object will set the following simulator options:

            - MC_USE_TRACE

        fluence: mcfluence.Fluence or mcfluence.FluenceCyl or mcfluence.FluenceRz
            Fluence object that collects the energy deposited by the photon
            packets on a regular 3D grid.

            The Fluence object will set the following simulator options:

            - MC_USE_FLUENCE

        surface: mcsurface.SurfaceLayouts
            Surface geometry that is applied to the outer sample surface
            and / or internal sample layers.

            The SurfaceLayouts object will set the following simulator options
            as required:

            - MC_USE_OUTER_SURFACE_LAYOUT
            - MC_USE_INTERNAL_SURFACE_LAYOUT
            - MC_USE_SURFACE_LAYOUTS

        types: mctypes.McDataTypes
            A class that defines all the simulator data types. Use one
            of the following predefined type classes derived from
            mctypes.McDataTypes:

            - mctypes.McDataTypesSingle

                - 32-bit size type
                - 32-bit default integers,
                - 64-bit detector accumulators,
                - 32-bit single precision floating-point arithmetics,
                - 32-bit photon packet counter (maximum number of photon
                  packets per OpenCL kernel call limited to 4,294,967,295)

            - mctypes.McDataTypesDouble

                - 32-bit size type
                - 32-bit default integers,
                - 64-bit detector accumulators,
                - 64-bit double precision floating-point arithmetics,
                - 32-bit photon packet counter (maximum number of photon
                  packets per OpenCL kernel call limited to 4,294,967,295)

            - mctypes.McDataTypesSingleCnt64

                - 64-bit size type
                - 32-bit default integers,
                - 64-bit detector accumulators,
                - 32-bit single precision floating-point arithmetics,
                - 64-bit photon packet counter (maximum number of photon
                  packets per OpenCL kernel call virtually unlimited)

            - mctypes.McDataTypesDoubleCnt64

                - 64-bit size type
                - 32-bit default integers,
                - 64-bit detector accumulators,
                - 64-bit double precision floating-point arithmetics,
                - 64-bit photon packet counter (maximum number of photon
                  packets per OpenCL kernel call virtually unlimited)

            A custom combination of data types can be defined by the
            :py:meth:`mctypes.McDataTypes.custom` method.

        options: List[mcoptions.McOption]
            A list of compile-time options for the simulator core.

            - mcoptions.McUseNativeMath: mcoptions.McUseNativeMath.on or mcoptions.McUseNativeMath.off
              Use of native math functions.
              Native functions are usually faster but less accurate.
              Default value is mcoptions.McUseNativeMath.off.

            - mcoptions.McMinimumPacketWeight: mcoptions.McMinimumPacketWeight(float)
              Minimum photon packet weight
              before going through photon packet termination or lottery.
              Default value is mcoptions.McMinimumPacketWeight(1e-4).

            - mcoptions.McUseLottery: mcoptions.McUseLottery(float) 
              Terminate photon packet by lottery.
              Default value is mcoptions.McUseLottery.on.

            - mcoptions.McPacketLotteryChance: mcoptions.McPacketLotteryChance(float)
              Used if photon packets are terminated by lottery.
              If the value of a uniform random number exceeds the specified
              chance, the photon packet is terminated.
              Default value is mcoptions.McPacketLotteryChance(0.01).

            - mcoptions.McLutMemory: mcoptions.McLutMemory.global_mem or mcoptions.McLutMemory.constant_mem
            - McFpLutMemory: mcoptions.McFpLutMemory.global_mem or mcoptions.McFpLutMemory.constant_mem
              Set the OpenCL memory type for integer and floating-point
              data lookup tables that are used by some scattering phase
              functions, accumulators with custom angular sensitivity and
              photon packet sources with custom emission characteristics.
              Note that the default setting is mcoptions.McFpLutMemory.global_mem.
              Using the constant memory (mcoptions.McFpLutMemory.constant_mem)
              can be significantly (10x or more) faster, in particular on
              older GPUs. However, note that the amount of constant
              memory available on GPUs is typically limited to about 64k.
              If using lookup table-based scattering phase
              functions that can be fit into the constant memory, then
              set this option to mcoptions.McFpLutMemory.constant_mem.
              This can significantly speed up the MC simulations (~ 10x).
              To further reduce the amount of constant GPU memory required
              by the lookup table data arrays, set the scattering phase
              functions of the first and last layer (the sourrounding medium)
              to one that is used by the internal (sample) layers.

            - mcoptions.McDebugMode: mcoptions.McDebugMode.on or mcoptions.McDebugMode.off
              Enables stdout debug information on some devices.
              Default value is mcoptions.McDebugMode.off.

        rnginit: np.uint64
            OpenCL random number generator initializer as a 64-bit unsigned
            integer.
            Use this initializer if there is a need to put the random, number
            generator into a known state. 

        cl_devices: str or cl.Device or List[cl.Device] or cl.Context cl.CommnadQueue
            A python list of OpenCL devices that are used for
            conducting the simulation.See the clGpuDevices and clCpuDevices
            functions of the clinfo module for details on obtaining a
            list of OpenCl capable devices. If None is provided, the
            first available device is used (GPU devices have priority over
            CPU devices). Use function clinfo.device to get a desired
            device by simple keywords, e.g. a call
            clinfo.device(['amd', 'nvidia', 'hd', 'cpu'], that will search
            for an AMD GPU, Nvidia GPU, Intel Hd GPU, any CPU and return
            the first device found.
            The value of this input argument can also be an
            instance of OpenCL Context or an instance of OpenCL CommandQueue.
            Note that in case an instance of CommandQueue is passed, the value
            of parameter cl_profiling is ignored since it is not possible to
            enable or disable profiling on an existing OpenCL CommandQueue.

        cl_build_options: List[str]
            A list of OpenCL build option as specified by the OpenCl manuals at
            https://www.khronos.org/.
            An example of commonly used build options:

            .. code-block:: python

                cloptions=['-cl-opt-disable', '-Werror',
                           '-cl-fast-relaxed-math', '-cl-mad-enable']

            Note that the "-cl-fast-relaxed-math" build option will likely
            lead to a significant performance improvement but may on some
            platforms lead to errors in the simulated quantities.

        cl_profiling: bool
            Enables OpenCL command queue profiling if set to True.
            Note that in case an instance of CommandQueue is passed as the
            cl_devices parameter, the value of parameter cl_profiling is
            ignored since it is not possible to enable or disable profiling
            on an existing OpenCL CommandQueue.

        Examples
        --------
        See the test/example.py for additional examples.

        >>> source = mcsource.GaussianBeam(0.001, 0.22)
        >>> l0 = mclayer.Layer(d=0.000, n=1.0, mua=0.0,  mus=0.0,   pf=Hg(0.5))
        >>> l1 = mclayer.Layer(d=0.001, n=1.3, mua=1e2,  mus=100e2, pf=Hg(0.8))
        >>> l2 = mclayer.Layer(d=0.010, n=1.3, mua=10e2, mus=30e2,  pf=Hg(0.7))
        >>> l3 = mclayer.Layer(d=0.000, n=1.0, mua=0.0,  mus=0.0,   pf=Hg(0.5))
        >>> layers = mclayer.Layers([l0, l1, l2, l3])
        >>> trace = mctrace.Trace(maxlen=1000, options=mctrace.Trace.TRACE_ALL)
        >>> mc = Mc(layers, source, trace)
        >>> trace_data, _, _ = mc.run(100)
        '''
        super().__init__(types=types, options=options, cl_devices=cl_devices,
                         cl_build_options=cl_build_options,
                         cl_profiling=cl_profiling, rnginit=rnginit)
        # Initialization of members for later use.
        self._cl_exec = self._cl_src = self._cl_src_options = None

        # the latest kernel timing report and the number of used threads
        self._run_report = {'upload': None, 'download': None,
                            'execution': None, 'threads': None,
                            'items': None}

        # Size of structures on the target OpenCL device
        self._sizeof_types = {}

        # Data types of the simulator objects.
        self._mc_obj_types = {'pf': None, 'source': None, 'detectors': None,
                              'fluence': None, 'trace': None,
                              'surface_layouts': None}

        # Prepare the sample layer stack.
        self._layers = mclayer.Layers(layers)
        self._layers.check()
        # Save the type of scattering phase function.
        self._mc_obj_types['pf'] = type(self._layers[0].pf)

        # Photon packet source.
        self._source = source
        self._mc_obj_types['source'] = type(source)

        # Packed buffers of simulator objects ready for use/transfer to the
        # OpenCL device. 
        self._packed = {'source': None,
                        'surface_layouts': None,
                        'layers': None, 'detectors': None,
                        'trace': None, 'fluence': None}

        # Sample surface reflectance/transmittance object
        self._detectors = detectors
        if self._detectors is not None:
            self._mc_obj_types['detectors'] = \
                type(self._detectors), self._detectors.types()

        # Photon packet trace object.
        self._trace = trace
        if self._trace is not None:
            self._mc_obj_types['trace'] = type(self._trace)

        # Fluence object.
        self._fluence = fluence
        if self._fluence is not None:
            self._mc_obj_types['fluence'] = type(self._fluence)

        # Surface geometry object
        self._surface_layouts = surface
        if self._surface_layouts is not None:
            self._mc_obj_types['surface_layouts'] = \
                    type(self._surface_layouts), self._surface_layouts.types()

        # Monte Carlo simulator options.
        if options is None:
            options = []
        self._mc_options = options

        # Default maximum simulation radius used by the Monte Carlo kernel.
        self._rmax = float('inf')

[docs]    def layer(self, index:int) -> mclayer.Layer:
        '''
        Returns the layer at the specified index.
        Layers can be also accessed by the :py:attr:`~xopto.mccyl.mc.Mc.layers`
        property and [] operator.

        Parameters
        ----------
        index: int
            Index of the layer.

        Returns
        -------
        layer: xopto.mccyl.mclayer.layer.Layer
            Layer at the specified index.
        '''
        return self._layers.layer(index)

[docs]    def layer_index(self, r: float) -> int:
        '''
        Returns the index of the layer that contains the specified r coordinate.

        Parameters
        ----------
        r: float
            R coordinate for which to find the layer index.

        Returns
        -------
        index: int
            Layer index that contains the specified r coordinate.
        '''
        return self._layers.layer_index(r)

[docs]    def append_r_lut(self, data: np.ndarray, force: bool = False) \
            -> mcworker.LutEntry:
        '''
        Append a read-only lookup table data array to the list of managed
        lookup table arrays.

        Parameters
        ----------
        data: np.ndarray
            Lookup table data to be added to the managed list. The numpy
            array must be of integer or floating-point type.
        force: bool
            Add the array to the managed list even if an existing entry for the
            given data array is found.

        Returns
        -------
        lutetry: LutEntry
            Lookup table entry. Use the offset property to locate the
            first element of the entry in the lookup table buffer.
        '''
        kind = data.dtype.kind
        if kind in ('i', 'u'):
            # integer data
            lutetry, new_item = self.append_r_int_lut(data, force=force)
        elif kind == 'f':
            # floating point data
            lutetry, new_item = self.append_r_float_lut(data, force=force)
        else:
            raise TypeError('Lookup tables only support integer and '
                            'floating-point data types!')

        return lutetry

[docs]    def clear_r_luts(self):
        '''
        Clear all the read-only lookup table managers
        '''
        self.clear_r_float_lut()
        self.clear_r_int_lut()

    def _pack(self, nphotons: int):
        '''
        Pack the data into buffers.

        Parameters
        ----------
        nphotons: int
            The number of photon packets that will be run by the simulator.
        '''
        # Clear existing allocations of detector and fluence.
        self.cl_rw_accumulator_allocator.clear()
        self.cl_rw_float_allocator.clear()
        self.cl_rw_int_allocator.clear()

        # Clear the lookup table managers
        self.clear_r_luts()

        # Finalize and pack the sample layer stack.
        self._packed['layers'] = self._layers.cl_pack(self, self._packed['layers'])

        if self._mc_obj_types['pf'] != type(self._layers[1].pf):
            raise ValueError('The scattering phase function kind/type must not' \
                             'change simulation calls!')

        # Finalize and pack the photon packet source.
        self._packed['source'], _, _ = \
                self._source.cl_pack(self, self._packed['source'])

        if self._mc_obj_types['source'] != type(self._source):
            raise ValueError('The photon packet source type/kind must not' \
                             'change between simulation calls!')

        # pack the detector configuration
        if self._detectors is not None:
            self._packed['detectors'] = self._detectors.cl_pack(
                self, self._packed['detectors'])

            if self._mc_obj_types['detectors'] != \
                    (type(self._detectors), self._detectors.types()):
                raise ValueError(
                    'Detector types must not change between simulation calls!')

        # Finalize and pack the fluence configuration data.
        if self._fluence is not None:
            self._packed['fluence'] = self._fluence.cl_pack(
                self, self._packed['fluence'])

        # Finalize and pack the photon packet trace configuration data.
        if self._trace is not None:
            self._packed['trace'] = self._trace.cl_pack(
                self, self._packed['trace'], nphotons)

        if self._surface_layouts:
            self._packed['surface_layouts'] = self._surface_layouts.cl_pack(
                self, self._packed['surface_layouts'])

            if self._mc_obj_types['surface_layouts'] != \
                    (type(self._surface_layouts), self._surface_layouts.types()):
                raise ValueError(
                    'Surface layout types must not change between simulation calls!')

    def _build_src(self, verbose=False):
        '''
        Generate and build the OpenCL source from a template.
        '''
        # Load the OpenCL kernel template from a file
        #with open(Mc.MC_CL_TEMPLATE_FILE, 'rt') as fp:
        #    template = jinja2.Template(fp.read())
        template = jinja2.Template(mcsrc.fuse(**_MCML_FUSION_SPEC, verbose=verbose))

        context = {'source':{}, 'trace':{}, 'fluence':{}, 'detectors': {},
                   'surface_layouts':{}, 'mc':{}}

        # Collect the photon packet source type compile context.
        context['source'] = {
            'options': self._source.fetch_cl_options(self),
            'declaration': self._source.fetch_cl_declaration(self),
            'implementation': self._source.fetch_cl_implementation(self)
        }

        # Collect the layer stack and scattering phase function compile context.
        context['pf'] = {
            'options': self._layers.fetch_cl_options(self),
            'declaration': self._layers.fetch_cl_declaration(self),
            'implementation': self._layers.fetch_cl_implementation(self),
        }

        # Collect the photon packet trace type compile context.
        if self._trace is not None:
            context['trace'] = {
                'options': self._trace.fetch_cl_options(self),
                'declaration': self._trace.fetch_cl_declaration(self),
                'implementation': self._trace.fetch_cl_implementation(self)
            }

        # Collect the fluence type compile context.
        if self._fluence is not None:
            context['fluence'] = {
                'options': self._fluence.fetch_cl_options(self),
                'declaration': self._fluence.fetch_cl_declaration(self),
                'implementation': self._fluence.fetch_cl_implementation(self)
            }

        # Collect the surface detector type build context.
        if self._detectors is not None:
            context['detectors'] = {
                'options': self._detectors.fetch_cl_options(self),
                'declaration': self._detectors.fetch_cl_declaration(self),
                'implementation': self._detectors.fetch_cl_implementation(self)
            }

        # Collect the top and bottom sample surface geometry.
        if self._surface_layouts is not None:
            context['surface_layouts'] = {
                'options': self._surface_layouts.fetch_cl_options(self),
                'declaration': self._surface_layouts.fetch_cl_declaration(self),
                'implementation': self._surface_layouts.fetch_cl_implementation(self)
            }

        mc_options = []
        for item in self._mc_options:
            if isinstance(item, mcoptions.McOption):
                mc_options.extend(item.fetch_cl_options(self))
            elif isinstance(item, (tuple, list)):
                mc_options.append(tuple(item))
            else:
                raise ValueError(
                    'Simulator options must be a list of McOption or a list '
                    'of tuples!')

        # Resolve/join all the simulator options.
        resolved_options = mcoptions.resolve_cl_options(
            self._types().fetch_cl_options(self),
            context['source'].get('options', []),
            context['pf'].get('options', []),
            context['detectors'].get('options', []),
            context['fluence'].get('options', []),
            context['trace'].get('options', []),
            context['surface_layouts'].get('options', []),
            mc_options
        )
        # Convert the resolved simulator options to defines.
        context['mc'] = {'options': mcoptions.make_defines(resolved_options)}
        # save all the kernel options for later use
        self._cl_src_options = dict(resolved_options)

        # Render the Monte Carlo OpenCL source file.
        self._cl_src = template.render(context)

[docs]    def export_src(self, filename: str = None) -> str:
        '''
        Export the OpenCL source to a file.

        Parameters
        ----------
        filename: str
            The OpenCl source code will be saved to this file. If a file
            name is not provided, the OpenCL source will be saved to
            :py:attr:`Mc.AUTOGEN_PATH` directory with a name
            "mcml_<random>.cl"

        Returns
        -------
        filename: str
            The filename that was used to save the OpenCL source code.
        '''
        if filename is None:
            suffix = str(time.perf_counter()).replace('.', '')
            filename = os.path.join(Mc.AUTOGEN_PATH, 'mcml_{}.cl'.format(suffix))
        with open(filename, 'wt') as fp:
            fp.write(self._cl_src)

    def _build_exec(self, exportsrc=None, verbose=False):
        '''
        Build the OpenCL executable.
        '''
        # Rebuild from source if required - first call of the run method.
        tb = time.perf_counter()
        # Generate / render the source code for this simulator instance.
        self._build_src(verbose=verbose)
        if exportsrc is not None and exportsrc:
            filename = None
            if isinstance(exportsrc, str):
                filename = exportsrc
            self.export_src(filename)
        # Compile the OpenCL source code.
        self._cl_exec = self.cl_build(self._cl_src, verbose)
        # options=['-cl-opt-disable', '-Werror']
        buildtime = time.perf_counter() - tb

        if verbose:
            print('OpenCL engine created in {:.3f} ms.'.format(buildtime*1e3))

        return buildtime

    def _fetch_typesizes(self, validate=True):
        '''
        Fetch the size of data types / structures on the target OpenCL device.
        '''
        np_sizes = np.zeros(10, dtype=np.uint32)
        cl_type_sizes = self.cl_rw_buffer('cl_type_sizes', np_sizes)
        self._cl_exec.sizeof_datatypes(
            self.cl_queue, (1,), None, cl_type_sizes).wait()
        cl.enqueue_copy(self.cl_queue, np_sizes, cl_type_sizes).wait()

        siz = self._sizeof_types = dict(
            zip(('layer', 'source', 'surface_layouts',
                 'detectors', 'trace', 'fluence'), np_sizes))

        if validate:
            n = cltypes.sizeof(self.layers[0].fetch_cl_type(self))
            if siz['layer'] != n:
                raise RuntimeError(
                    'The host and device size of the McLayer structure '
                    'are not the same!'.format(n, siz['layer']))

            n = cltypes.sizeof(self.source.fetch_cl_type(self))
            if siz['source'] != n:
                raise RuntimeError(
                    'The host ({}) and device ({}) size of the McSource '
                    'structure are not the same!'.format(n, siz['source']))

            if self.detectors is not None:
                n = cltypes.sizeof(self.detectors.fetch_cl_type(self))
                if siz['detectors'] != n:
                    raise RuntimeError(
                        'The host ({}) and device ({}) size of the McDetectors '
                        'structure are not the same!'.format(n, siz['detectors']))

            if self.trace is not None:
                n = cltypes.sizeof(self.trace.fetch_cl_type(self))
                if siz['trace'] != n:
                    raise RuntimeError(
                        'The host ({}) and device ({}) size of the McTrace '
                        'structure are not the same!'.format(n, siz['trace']))

            if self.fluence is not None:
                n = cltypes.sizeof(self.fluence.fetch_cl_type(self))
                if siz['fluence'] != n:
                    raise RuntimeError(
                        'The host ({}) and device ({}) size of the McFluence '
                        'structure are not the same!'.format(n, siz['fluence']))

            if self.surface is not None:
                n = cltypes.sizeof(self.surface.fetch_cl_type(self))
                if siz['surface_layouts'] != n:
                    raise RuntimeError(
                        'The host ({}) and device ({}) size of the '
                        'McSurfaceLayouts structure are not the same!'.format(
                            n, siz['surface_layouts']))

[docs]    def run(self, nphotons: int,
            out: MC_RESULT_TYPE or None = None,
            wgsize: int = None, maxthreads: int = None,
            copyseeds: bool = False, exportsrc: bool = None,
            verbose: bool = False) -> MC_RESULT_TYPE:
        '''
        Run a simulation with the specified number of photon packets.

        Parameters
        ----------
        nphotons: int
            Number of photon packets to simulate. Note that the maximum number
            of photon packets is limited to :py:attr:`Mc.types.mc_cnt_max`.
        out: MC_RESULT_TYPE or None
            Existing trace, fluence and/or detector objects that will be
            updated with the simulation results if not None. Pass objects
            that were returned after the previous/first call of the
            :py:meth:`run` method.
            Do NOT pass any of the objects that were used as input arguments
            of the :py:meth:`Mc` constructor!.
        wgsize: int
            Local work group size. If None (default), the optimal work group
            size is determined automatically.
        maxthreads: int
            Total number of threads to use. Limited by the
            number of available random number generators (150,000).
        copyseeds: bool
            If True, copy the mutable seeds of the random number generator
            from the OpenCL kernel to the host. This is only required if the
            seeds need to be reused or inspected (for debug purposes).
        exportsrc: str or bool
            Optional location to save the OpenCL source code. The source file
            is saved only on the first run of the simulator.
        verbose: bool
            If set to True, additional debug information is displayed.

        Returns
        -------
        trace: mctrace.Trace
            Trace object instance with simulation results or None if not used
            by the simulator.
        fluence: mcfluence.Fluence or mcfluence.FluenceCyl or mcfluence.FluenceRz
            Fluence object instance with simulation results or None if not used
            by the simulator.
        detector: mcdetector.Detectors
            Detectors object instance with simulation results or None if not
            used by the simulator.

        Note
        ----
        The returned :py:class:`mctrace.Trace`,
        :py:class:`mcfluence.Fluence` and :py:class:`mcdetector.Detector`
        objects are not the same instances as were passed to the
        constructor :py:meth:`Mc`. A new set of instances is created
        for the results of each run of the simulator.

        The maximum number of photon packets that can be simulated in a
        single run is limited by the kernel data type used as the photon
        packet counter. The maximum value is limited to
        :py:attr:`Mc.types.mc_cnt_max`.
        '''
        if out is None:
            out = (None, None, None)

        nphotons = int(nphotons)

        # finalize and pack the configuration data
        self._pack(nphotons)

        # shortcut to memory flags
        mf = cl.mem_flags
        # predefined combinations of memory access flags
        mf_rw = mf.READ_WRITE
        mf_r = mf.READ_ONLY
        mf_w = mf.WRITE_ONLY
        mf_cp_host = mf.COPY_HOST_PTR

        # shortcuts to buffers
        cl_buffers = self.cl_buffers
        np_buffers = self.np_buffers
        packed = self._packed

        # check if the maximum number of photon packets is not exceeded
        if nphotons > self._types.mc_cnt_max:
            raise ValueError(
                'Maximum number of photon packets that can be '
                'simulated in a single run is limited to {:,d}!'.format(
                    self._types.mc_cnt_max)
            )

        # start timing this run
        t0 = time.perf_counter()

        if self._cl_exec is None:
            # Rebuild from source if required - first call of the run method.
            self._build_exec(exportsrc=exportsrc, verbose=verbose)

            # fetch the data type sizes on the OpenCL device and validate /
            # compare to the size of the data types on the host
            self._fetch_typesizes(validate=True)

            # allocate a global counter of processed photon packets
            np_buffers['num_processed_packets'] = \
                np.zeros([1], dtype=self._types.np_cnt)

            # allocate a counter of OpenCL kernels
            np_buffers['num_kernels'] = np.zeros([1], dtype=np.uint32)

            # allocate and initialize an OpenCL buffer with the mutable seeds
            # of the random number generator
            self.cl_rw_buffer('rng_seeds_x', self.rng_seeds_x)

            # allocate and initialize an OpenCL buffer with the immutable seeds
            # of the random number generator
            self.cl_r_buffer('rng_seeds_a', self.rng_seeds_a)

        data = np_buffers['num_processed_packets']
        data[0] = 0
        self.cl_rw_buffer('num_processed_packets', data)
        
        # initialize the number of used OpenCL kernels to 0
        data = np_buffers['num_kernels']
        data[0] = 0
        self.cl_rw_buffer('num_kernels', data)

        # allocate an OpenCL buffer for the sample layer stack
        self.cl_r_buffer('layers', packed['layers'])

        # allocate an OpenCL buffer for the photon packet source
        self.cl_r_buffer('source', packed['source'])

        # allocate and initialize an OpenCL buffer for the surface layouts
        self.cl_r_buffer('surface_layouts', packed['surface_layouts'], size=8)

        # allocate and intitalize an OpenCL buffer for the trace
        self.cl_r_buffer('trace', packed.get('trace'), size=4)

        # allocate and intitalize an OpenCL buffer for the fluence
        self.cl_r_buffer('fluence', packed.get('fluence'), size=4)

        # allocate and intitalize an OpenCL buffer for the surface detector
        self.cl_r_buffer('detectors', packed.get('detectors'), size=4)

        # allocate and initialize an OpenCL buffer for the floating-point
        # lookup-table
        self.cl_r_float_lut()

        # allocate and initialize the read-write accumulator type buffer
        self.cl_rw_accumulator_buffer(fill=0)

        # allocate and intitalize the read-write floating-point type buffer
        self.cl_rw_float_buffer(fill=0.0)

        # allocate and intitalize the read-write integer type buffer
        self.cl_rw_int_buffer(fill=0)

        # Check if floating-point lookup tables are properly used.
        if len(self.float_r_lut_manager) and 'MC_USE_FP_LUT' not in self._cl_src_options:
            raise ValueError(
                'Floating-point lookup tables are used by one or more modules '
                'but the compile-time kernel option MC_USE_FP_LUT was not set '
                'to True!')

        # Check if integer lookup tables are properly used.
        if len(self.int_r_lut_manager) and 'MC_USE_INT_LUT' not in self._cl_src_options:
            raise ValueError(
                'Integer lookup tables are used by one or more modules but '
                'the compile-time kernel option MC_USE_INT_LUT was not '
                'set to True!')

        cl_queue = self.cl_queue

        # limit the maximum number of threads that can be concurrently run on
        # the OpenCL device - limited by the available number of rng seeds
        if maxthreads is None:
            maxthreads = self.cl_max_threads
        else:
            maxthreads = min(maxthreads, self.cl_max_threads)

        # configure the local and global work group size
        if wgsize is None:
            localWG = None
            globalWG = [maxthreads]
        else:
            wgsize = int(wgsize)
            localWG = [wgsize]
            globalWG = [(maxthreads//wgsize)*wgsize]

        # start timing the kernel call
        t1 = time.perf_counter()

        # call the kernel
        self._cl_exec.McKernel(
            cl_queue, globalWG, localWG,

            np.dtype(self._types.np_cnt).type(nphotons),
            cl_buffers['num_processed_packets'],

            cl_buffers['num_kernels'],

            np.dtype(self._types.np_float).type(self._rmax),

            cl_buffers['rng_seeds_x'],
            cl_buffers['rng_seeds_a'],

            np.uint32(len(self._layers)),
            cl_buffers['layers'],

            cl_buffers['source'],

            cl_buffers['surface_layouts'],

            cl_buffers['trace'],

            cl_buffers['fluence'],

            cl_buffers['detectors'],

            self.cl_r_float_lut(False),

            self.cl_rw_int_buffer(),
            self.cl_rw_float_buffer(),
            self.cl_rw_accumulator_buffer(),
        ).wait()

        t2 = time.perf_counter()

        # get the number of concurrent kernels that were used to process the job
        cl.enqueue_copy(
            cl_queue,
            np_buffers['num_kernels'], cl_buffers['num_kernels']).wait()

        # copy seeds from the kernel - actually not required but good for debug
        if copyseeds:
            cl.enqueue_copy(
                cl_queue,
                self.rng_seeds_x, cl_buffers['rng_seeds_x']).wait()

        # download the trace data and update the trace object
        trace_res = out[0]
        if self._trace is not None:
            with self.np_allocators as np_allocators:
                all_data = self._download_allocated_buffers(
                    self._trace, np_allocators, nphotons)
                if all_data:
                    if trace_res is None:
                        trace_res = type(self._trace)(self._trace)
                    trace_res.update_data(self, all_data, nphotons=nphotons)

        # download the fluence data and update the fluence object
        fluence_res = out[1]
        if self._fluence is not None:
            with self.np_allocators as np_allocators:
                all_data = self._download_allocated_buffers(
                    self._fluence, np_allocators, nphotons)
                if all_data:
                    if fluence_res is None:
                        fluence_res = type(self._fluence)(self._fluence)
                    fluence_res.update_data(self, all_data, nphotons=nphotons)

        # download the detector data and update the detectors
        detectors_res = out[2]
        if self._detectors is not None:
            for detector in self._detectors:
                with self.np_allocators as np_allocators:
                    all_data = self._download_allocated_buffers(
                        detector, np_allocators, nphotons)
                    if all_data:
                        if detectors_res is None:
                            detectors_res = type(self._detectors)(self._detectors)
                        detectors_res.update_data(
                            self, detector, all_data, nphotons=nphotons)

        uploadtime = t1 - t0
        exectime = t2 - t1
        downloadtime = time.perf_counter() - t2
        self._run_report.update(upload=uploadtime, download=downloadtime,
                                execution=exectime, items=nphotons,
                                threads=int(np_buffers['num_kernels'][0]))
        if verbose:
            print('McKernel processed {:,d} packets in {:,d} threads:\n'
                  '    uploaded/built in: {:10.3f} ms\n'
                  '    executed in      : {:10.3f} ms\n'
                  '    downloaded in    : {:10.3f} ms'.format(
                       nphotons, int(np_buffers['num_kernels'][0]),
                       uploadtime*1e3, exectime*1e3, downloadtime*1e3)
            )

        return trace_res, fluence_res, detectors_res

    def _download_allocated_buffers(
            self, owner: any, np_allocators: NumpyAllocators, nphotons) -> \
                Dict[np.dtype, List[np.array]]:
        all_data = {}
        for allocator in self.cl_rw_allocators:
            for allocation in allocator.allocations(owner):
                if allocation.download:
                    buffer = None
                    if hasattr(owner, 'np_buffer'):
                        buffer = owner.np_buffer(
                            self, allocation, nphotons=nphotons)
                    if buffer is None:
                        buffer = np_allocators.allocate(
                            allocation.dtype, allocation.size)
                    self.cl_allocation_download(allocation, buffer)
                    data_of_type = all_data.get(allocation.dtype, [])
                    data_of_type.append(buffer)
                    all_data[allocation.dtype] = data_of_type
        return all_data

[docs]    def sampling_volume(self, trace: mctrace.Trace, sv: mcsv.SamplingVolume,
                        wgsize: int = None, maxthreads: int = None,
                        exportsrc: bool = None, verbose: bool = False) \
                            -> mcsv.SamplingVolume:
        '''
        Compute the sampling volume from the given simulation trace.

        Parameters
        ----------
        trace: mctrace.Trace
            Simulation trace object to process.
        sv: mcsv.SamplingVolume
            Target sampling volume that will be updated with the results.
        wgsize: int
            Local work group size. If None (default), the optimal work group
            size is determined automatically.
        maxthreads: int
            Total number of parallel threads to use. Limited by the
            number of available random number generators (150,000).
        exportsrc: str or bool
            Optional location to save the OpenCL source code. The source file
            is saved only on the first run of the simulator.
        verbose: bool
            If set to True, additional debug information is displayed.

        Returns
        -------
        sv: mcsv.SamplingVolume
            Input sampling volume updated with the data from the trace.
        '''
        if self._trace is None:
            raise RuntimeError(
                'This Monte Carlo simulator was build without the Trace '
                'functionallity (the trace argument of :py:meth:Mc.__init__) '
                'was None! Sampling volume analysis requires trace '
                'functionality!')

        if self._cl_exec is None:
            # Rebuild from source if required - first call of the run method.
            self._build_exec(exportsrc=exportsrc, verbose=verbose)

        t0 = time.perf_counter()

        nphotons = trace.nphotons

        # allocate some numpy buffers
        if self.np_buffers.get('num_processed_packets') is None:
            self.np_buffers['num_processed_packets'] = np.zeros((1,),
                dtype=self.types.np_cnt)
        if self.np_buffers.get('num_kernels') is None:
            self.np_buffers['num_kernels'] = np.zeros((1,),
                dtype=np.uint32)
        if self.np_buffers.get('sv_total_weight') is None:
            self.np_buffers['sv_total_weight'] = np.zeros((1,),
                dtype=self.types.np_accu)

        # Clear existing allocations of detector and fluence.
        self.cl_rw_accumulator_allocator.clear()
        self.cl_rw_float_allocator.clear()
        self.cl_rw_int_allocator.clear()

        data = self.np_buffers['num_processed_packets']
        data.fill(0)
        self.cl_rw_buffer('num_processed_packets', data)

        # initialize the number of used OpenCL kernels to 0
        data = self.np_buffers['num_kernels']
        data.fill(0)
        self.cl_rw_buffer('num_kernels', data)

        # initialize the total weight of photon packets
        data = self.np_buffers['sv_total_weight']
        data.fill(0)
        self.cl_rw_buffer('sv_total_weight', data)

        # Pack the data for OpenCL
        packed = self._packed

        packed['sv_trace'] = trace.cl_pack(
            self, packed.get('sv_trace'), nphotons=nphotons)
        packed['sv'] = sv.cl_pack(self, packed.get('sv'))

        cl_sv_trace = self.cl_r_buffer('sv_trace', packed['sv_trace'])
        cl_sv = self.cl_r_buffer('sv', packed['sv'])

        # allocate and initialize the read-write accumulator buffer
        self.cl_rw_accumulator_buffer(fill=0)
        # allocate and intitalize the read-write floating-point buffer
        self.cl_rw_float_buffer()
        # allocate and intitalize the read-write integer buffer
        self.cl_rw_int_buffer()

        # upload the trace data
        trace_len = self.cl_rw_int_allocator.allocations(trace)[0]
        self.cl_allocation_upload(trace_len, trace.n_to_cl(self))
        trace_data = self.cl_rw_float_allocator.allocations(trace)[0]
        self.cl_allocation_upload(trace_data, trace.data_to_cl(self))

        # limit the maximum number of threads that can be concurrently run on
        # the OpenCL device - limited by the available number of rng seeds
        if maxthreads is None:
            maxthreads = self.cl_max_threads
        else:
            maxthreads = min(maxthreads, self.cl_max_threads)

        # configure the local and global work group size
        if wgsize is None:
            localWG = None
            globalWG = [maxthreads]
        else:
            wgsize = int(wgsize)
            localWG = [wgsize]
            globalWG = [(maxthreads//wgsize)*wgsize]

        t1 = time.perf_counter()

        self._cl_exec.SamplingVolume(
            self.cl_queue, globalWG, localWG,

            np.dtype(self._types.np_cnt).type(nphotons),
            self.cl_buffers['num_processed_packets'],
            self.cl_buffers['num_kernels'],

            cl_sv_trace,
            cl_sv,

            self.cl_buffers['sv_total_weight'],

            self.cl_rw_int_buffer(),
            self.cl_rw_float_buffer(),
            self.cl_rw_accumulator_buffer()
        ).wait()

        t2 = time.perf_counter()

        # Get the number of concurrent kernels that were used to process the job.
        cl.enqueue_copy(
            self.cl_queue,
            self.np_buffers['num_kernels'],
            self.cl_buffers['num_kernels']).wait()

        cl.enqueue_copy(
            self.cl_queue,
            self.np_buffers['sv_total_weight'],
            self.cl_buffers['sv_total_weight']).wait()

        accus = []
        sv_out = sv
        # sv_out = type(sv)(sv)
        with self.np_allocators as np_allocator:
            for allocation in self.cl_rw_accumulator_allocator.allocations(sv):
                buffer = np_allocator.allocate(
                    self.types.np_accu, allocation.size)
                accus.append(self.cl_allocation_download(allocation, buffer))

            if accus:
                sv_out.update_data(
                    self, accumulators=accus, nphotons=nphotons,
                    total_weight=self.np_buffers['sv_total_weight'])

                uploadtime = t1 - t0
        exectime = t2 - t1
        downloadtime = time.perf_counter() - t2

        self._run_report.update(upload=uploadtime, download=downloadtime,
                                execution=exectime, items=nphotons,
                                threads=int(self.np_buffers['num_kernels'][0]))

        if verbose:
            print('SamplingVolume processed {:d} packets in {:d} threads:\n'
                  '    uploaded/built in: {:10.3f} ms\n'
                  '    executed in      : {:10.3f} ms\n'
                  '    downloaded in    : {:10.3f} ms'.format(
                       nphotons, int(self.np_buffers['num_kernels'][0]),
                       uploadtime*1e3, exectime*1e3, downloadtime*1e3)
            )

        return sv_out

    def _get_types(self) -> mctypes.McDataTypes:
        return self._types
    types = property(_get_types, None, None, 'Simulator data types.')

    def _get_rmax(self) -> float:
        return self._rmax
    def _set_rmax(self, rmax: float):
        rmax = float(rmax)
        if rmax == 0.0:
            raise ValueError('Simulation radius must be > 0.0 m!')
        self._rmax = rmax
    rmax = property(_get_rmax, _set_rmax, None,
                    'Maximum simulation radius measured from '\
                    'the photon packet source position (m).')

    def _get_cl_exec(self) -> cl.Program:
        return self._cl_exec
    cl_exec = property(_get_cl_exec, None, None,
                       'Compiled OpenCL code / executable with kernels.')

    def _get_layers(self) -> mclayer.Layers:
        return self._layers
    def _set_layers(self, layers: mclayer.Layers or List[mclayer.Layer]):
        self._layers = mclayer.Layers(layers)
    layers = property(_get_layers, _set_layers, None, 'Sample layer stack.')

    def _get_source(self) -> mcsource.Source:
        return self._source
    source = property(_get_source, None, None, 'Photon packet source object.')

    def _get_detectors(self) -> mcdetector.Detectors:
        return self._detectors
    detectors = property(_get_detectors, None, None,
                         'Detectors object if available.')

    def _get_fluence(self) -> mcfluence.Fluence:
        return self._fluence
    fluence = property(_get_fluence, None, None,
                       'Fluence object if available.')

    def _get_trace(self) -> mctrace.Trace:
        return self._trace
    trace = property(_get_trace, None, None, 'Trace object if available.')

    def _get_surface(self) -> mcsurface.SurfaceLayouts:
        return self._surface_layouts
    surface = property(_get_surface, None, None, 'Sample surface layout.')

    def _get_run_report(self) -> dict:
        return self._run_report
    run_report = property(_get_run_report, None, None,
                          'Timing and performance data of the latest kernel '
                          'run as a dict with keys "upload", "download", '
                          '"execution", "threads" and "items". '
                          'The timing values with keys "upload", "download" '
                          'and "execution" are given in seconds. '
                          'The number of OpenCL threads with key "threads" '
                          'and the number of processed items with key "items" '
                          'are integer values. '
                          'All values are initialized to None.')

    ############################# Testing code ###########################
[docs]    def dbg(self):
        data = self._npDbg
        res = {}
        res['nPhotons'] = int(data[0])
        res['nPhotonsDone'] = int(data[1])
        res['nLayers'] = int(data[2])
        res['layers'] = []
        pos = 3
        for l in range(res['nLayers']):
            layer = {}
            layer['thickness'] = data[pos]
            layer['top'] = data[pos + 1]
            layer['bottom'] = data[pos + 2]
            layer['n'] = data[pos + 3]
            layer['cosCriticalTop'] = data[pos + 4]
            layer['cosCriticalBottom'] = data[pos + 5]
            layer['muS'] = data[pos + 6]
            layer['muA'] = data[pos + 7]
            layer['muAS'] = data[pos + 8]
            layer['invMuAS'] = data[pos + 9]
            layer['muAinvMuAS'] = data[pos + 10]
            layer['pfparams'] = data[pos + 11: pos + 16]
            res['layers'].append(layer)
            pos += 16
        res['source'] = {'origin':data[pos:pos + 3], 'layer':data[pos + 3],
                         'params':data[pos + 4: pos + 8]}

        res['fluenceStride'] = data[pos + 8: pos + 11]
        res['fluenceTopLeft'] = data[pos + 11: pos + 14]
        res['fluenceInvVoxelSize'] = data[pos + 14: pos + 17]

        res['rtStride'] = data[pos + 17: pos + 19]
        res['rtTopLeft'] = data[pos + 19: pos + 21]
        res['rtInvPixelSize'] = data[pos + 21: pos + 23]

        res['rtAcceptanceCosMin'] = data[pos + 23: pos + 25]

        return res

[docs]    def rng_test(self, n: int, x: int = None, a: int = None) -> np.ndarray:
        '''
        Generate random numbers on the OpenCL device. The random number
        generator is the same as used by the Monte Carlo kernel. The
        precision is inherited from the Simulator.
        The states / seeds x and a can be optionally generated with the 
        :py:attr:`rng` property that is an instance of
        :py:class:`xopto.cl.clrng.Random`.

        Parameters
        ----------
        n: int
            The number of random numbers to generate.
        x: np.uint64
            Mutable state of the random number generator.
        a: np.uint32
            Immutable state of the random number generator.

        Returns
        -------
        data: np.ndarray
            A vector of n random numbers.
        '''
        if x is None or a is None:
            x, a = self.rng.seeds(1)

        n = int(n)

        mf = cl.mem_flags
        cl_buffer = cl.Buffer(self._cl_context, mf.WRITE_ONLY,
                              np.dtype(self.types.np_float).itemsize*n)

        self._cl_exec.RngKernel(
            self._cl_queue, [1], None,
            np.uint64(x), np.uint32(a),
            np.dtype(self._types.np_size).type(n),
            cl_buffer
        ).wait()

        result = np.zeros((n,), dtype=self.types.np_float)
        cl.enqueue_copy(self._cl_queue, result, cl_buffer).wait()

        return result

[docs]    def testMcSinCos(self, x):
        mf = cl.mem_flags

        x = np.asarray(x, dtype=self._fptype)

        cl_x = cl.Buffer(self._cl_context, mf.READ_ONLY |
                         mf.COPY_HOST_PTR, hostbuf=x)

        cl_sin = cl.Buffer(self._cl_context, mf.WRITE_ONLY,
                           self._fptype.itemsize*x.size)
        cl_cos = cl.Buffer(self._cl_context, mf.WRITE_ONLY,
                           self._fptype.itemsize*x.size)

        self._cl_exec.testMcSinCos(
            self._cl_queue, [x.size], None,
            self._uitype.type(x.size), cl_x, cl_sin, cl_cos).wait()

        sinout = np.zeros_like(x)
        cosout = np.zeros_like(x)
        cl.enqueue_copy(self._cl_queue, sinout, cl_sin).wait()
        cl.enqueue_copy(self._cl_queue, cosout, cl_cos).wait()
        return sinout, cosout

[docs]    def testMcSqrt(self, x, native=True):
        x = np.asarray(x, dtype=self._fptype)
        mf = cl.mem_flags
        cl_x = cl.Buffer(self._cl_context, mf.READ_ONLY |
                         mf.COPY_HOST_PTR, hostbuf=x)
        cl_out = cl.Buffer(self._cl_context, mf.WRITE_ONLY,
                           self._fptype.itemsize*x.size)

        if native:
            fun = self._cl_exec.testMcSqrtNative
        else:
            fun = self._cl_exec.testMcSqrt
        fun(self._cl_queue, [x.size], None,
            self._uitype.type(x.size), cl_x, cl_out).wait()

        out = np.zeros_like(x)
        cl.enqueue_copy(self._cl_queue, out, cl_out).wait()
        return out

[docs]    def testMcRSqrt(self, x, native=True):
        x = np.asarray(x, dtype=self._fptype)
        mf = cl.mem_flags
        cl_x = cl.Buffer(self._cl_context, mf.READ_ONLY |
                         mf.COPY_HOST_PTR, hostbuf=x)
        cl_out = cl.Buffer(self._cl_context, mf.WRITE_ONLY,
                           self._fptype.itemsize*x.size)

        if native:
            fun = self._cl_exec.testMcRSqrtNative
        else:
            fun = self._cl_exec.testMcRSqrt
        fun(self._cl_queue, [x.size], None,
            self._uitype.type(x.size), cl_x, cl_out).wait()

        out = np.zeros_like(x)
        cl.enqueue_copy(self._cl_queue, out, cl_out).wait()
        return out

[docs]    def testMcFpDiv(self, x, y, native=True):
        mf = cl.mem_flags

        x = np.asarray(x, dtype=self._fptype)
        y = np.asarray(y, dtype=self._fptype)
        if x.size != y.size:
            raise ValueError('Division parameters must be of the same size!')
        cl_x = cl.Buffer(self._cl_context, mf.READ_ONLY |
                         mf.COPY_HOST_PTR, hostbuf=x)
        cl_y = cl.Buffer(self._cl_context, mf.READ_ONLY |
                         mf.COPY_HOST_PTR, hostbuf=y)

        cl_out = cl.Buffer(self._cl_context, mf.WRITE_ONLY,
                           self._fptype.itemsize*x.size)

        if native:
            fun = self._cl_exec.testMcFpDivNative
        else:
            fun = self._cl_exec.testMcFpDiv
        fun(self._cl_queue, [x.size], None,
            self._uitype.type(x.size), cl_x, cl_y, cl_out).wait()

        out = np.zeros_like(x)
        cl.enqueue_copy(self._cl_queue, out, cl_out).wait()

        return out

[docs]    def testMcLog(self, x, native=True):
        x = np.asarray(x, dtype=self._fptype)
        mf = cl.mem_flags
        cl_x = cl.Buffer(self._cl_context, mf.READ_ONLY |
                         mf.COPY_HOST_PTR, hostbuf=x)
        cl_out = cl.Buffer(self._cl_context, mf.WRITE_ONLY,
                           self._fptype.itemsize*x.size)

        if native:
            fun = self._cl_exec.testMcFpLogNative
        else:
            fun = self._cl_exec.testMcFpLog
        fun(self._cl_queue, [x.size], None,
            self._uitype.type(x.size), cl_x, cl_out).wait()

        out = np.zeros_like(x)
        cl.enqueue_copy(self._cl_queue, out, cl_out).wait()
        return out

[docs]    def devDataTypes(self):
        mf = cl.mem_flags
        s = np.zeros([16], dtype=np.uint32)
        cl_s = cl.Buffer(self._cl_context, mf.WRITE_ONLY |
                         mf.COPY_HOST_PTR, hostbuf=s)

        self._cl_exec.dataTypes(
            self._cl_queue, [1], None,
            cl_s)

        cl.enqueue_copy(self._cl_queue, s, cl_s).wait()

        return {'McInt':s[0], 'McUInt':s[1],
                'McUInt32':s[2], 'McUInt64':s[3],
                'McFluenceInt':s[4], 'McRtInt':s[5],
                'McFloat':s[6], 'McSim':s[7], 'McPf':s[8]}

if __name__ == '__main__':
    import os.path

    import matplotlib.pyplot as pp

    from xopto import USER_TMP_PATH
    from xopto.mccyl.mcutil.fiber import MultimodeFiber, MultimodeFiberLut, FiberLayout
    from xopto.mccyl.mcutil.lut import EmissionLut
    from xopto.cl import clinfo
    from xopto.pf import Hg, Mie
    from xopto import make_user_dirs


    class Coating(mcsurface.SurfaceLayoutAny):
        def cl_type(self, mc: McObject) -> cltypes.Structure:
            T = mc.types
            class ClCoating(cltypes.Structure):
                _fields_ = [
                    ('absorption', T.mc_fp_t),
                    ('window_height', T.mc_fp_t),
                    ('beam_block_r', T.mc_fp_t),
                    ('specular_fraction', T.mc_fp_t)
                ]
            return ClCoating

        def cl_declaration(self, mc: McObject) -> str:
            '''
            Structure that defines the source in the Monte Carlo simulator.
            '''
            Loc = self.location.capitalize()
            return '\n'.join((
                'struct MC_STRUCT_ATTRIBUTES Mc{}SurfaceLayout {{'.format(Loc),
                '	mc_fp_t absorption;',
                '	mc_fp_t window_height;',
                '	mc_fp_t beam_block_r;',
                '	mc_fp_t specular_fraction;',
                '};'
            ))

        def fetch_cl_implementation(self, mc: 'McObject') -> str:
            loc = self.location
            Loc = loc.capitalize()
            return '\n'.join((
                'void dbg_print_{}_surface_layout('.format(loc),
                '		__mc_surface_mem const Mc{}SurfaceLayout *layout){{'.format(Loc),
                '	dbg_print("Mc{}SurfaceLayout:");'.format(Loc),
                '	dbg_print_float(INDENT "absorption:", layout->absorption);',
                '	dbg_print_float(INDENT "window_height:", layout->window_height);',
                '	dbg_print_float(INDENT "beam_block_r:", layout->beam_block_r);',
                '	dbg_print_float(INDENT "specular_fraction:", layout->specular_fraction);',
                '};',
                '',
                'inline int mcsim_{}_surface_layout_handler('.format(loc),
	            '		McSim *psim, mc_int_t next_layer_index, ',
                '		mc_fp_t *n2, mc_fp_t *cc){',
                '',
                '	__mc_surface_mem const Mc{}SurfaceLayout *layout = '.format(Loc),
                '		mcsim_{}_surface_layout(psim);'.format(loc),
                '	mc_point3f_t *pos = mcsim_position(psim);',
                '',
                '	mc_int_t current_layer_index = mcsim_current_layer_index(psim);',
                '	mc_size_t num_layers = mcsim_layer_count(psim);',
                '',
                '	if (!((next_layer_index == num_layers - 2 &&',
                '			current_layer_index == num_layers - 1) ||',
                '			(next_layer_index == num_layers - 1 &&',
                '			current_layer_index == num_layers - 2) || next_layer_index == 0)) {',
                '		return MC_SURFACE_LAYOUT_CONTINUE;',
                '	}',
                '',
                '	mc_fp_t offset = pos->y*pos->y + pos->z*pos->z;'
                '	if (offset < mc_fsquare(layout->beam_block_r) && pos->x > FP_0) {',
                '		mcsim_adjust_weight(psim, mcsim_weight(psim)*layout->absorption);',
                '	} else if (mc_fabs(pos->z) < layout->window_height*0.5 && pos->y >= FP_0) {',
                '		return MC_SURFACE_LAYOUT_CONTINUE;',
                '	} else {',
                '		mcsim_adjust_weight(psim, mcsim_weight(psim)*layout->absorption);',
                '	}',
                '',
                '	if (mcsim_weight(psim) <= FP_0)',
                '		return MC_SURFACE_LAYOUT_CONTINUE;'
                '',
                '	mc_point3f_t new_dir;',
                '	mc_point3f_t normal;',
                '	radial_normal(pos, &normal);',
                '	reflect(mcsim_direction(psim), &normal, &new_dir);',
                '',
                '	if (mcsim_random(psim) > layout->specular_fraction) {',
                '		scatter_direction(&new_dir, ',
                '			mc_sqrt(FP_1 - mcsim_random(psim)),',
                '			FP_2PI*mcsim_random(psim));',
                '	};',
                '',
                '	mcsim_set_direction(psim, &new_dir);',
                '',
                '	return MC_REFLECTED;',
                '};'
            ))

        def __init__(self, absorption: float = 0.99,
                     window_height: float = 1e-3,
                     beam_block_diameter: float = 1e-3,
                     specular: float = 1.0):
            self._absorption = 1.0
            self._window_height = 1e-3
            self._beam_block_diameter = 1e-3
            self._specular = 0.0

            self._set_absorption(absorption)
            self._set_specular(specular)
            self._set_window_height(window_height)
            self._set_beam_block_diameter(beam_block_diameter)

            super().__init__()

        def _get_window_height(self) -> float:
            return self._window_height
        def _set_window_height(self, height: float):
            self._window_height = max(0.0, float(height))
        window_height = property(_get_window_height, _set_window_height, None,
                                 'Clear window height (m).')

        def _get_beam_block_diameter(self) -> float:
            return self._beam_block_diameter
        def _set_beam_block_diameter(self, diameter: float):
            self._beam_block_diameter = max(0.0, float(diameter))
        beam_block_diameter = property(
            _get_beam_block_diameter, _set_beam_block_diameter, None,
            'Beam block coating spot diameter (m).')

        def _get_absorption(self) -> float:
            return self._absorption
        def _set_absorption(self, absorption: float):
            self._absorption = min(max(0.0, float(absorption)), 1.0)
        absorption = property(_get_absorption, _set_absorption, None,
                              'Coating absorption as a fraction from 0 to 1.')

        def _get_specular(self) -> float:
            return self._specular
        def _set_specular(self, absorption: float):
            self._specular = min(max(0.0, float(absorption)), 1.0)
        specular = property(_get_specular, _set_specular, None,
                            'Fraction of unabsorbed light that is reflected '
                            'specularly.')

        def cl_pack(self, mc: McObject, target: cltypes.Structure = None) \
                -> cltypes.Structure:
            if target is None:
                target_type = self.cl_type(mc)
                target = target_type()

            target.absorption = self._absorption
            target.window_height = self._window_height
            target.beam_block_r = self._beam_block_diameter*0.5
            target.specular = self._specular

            return target

        def __str__(self):
            return 'Coating(absorption={}, window_height={}, '\
                   'beam_block_diameter={}, specular={})'.format(
                       self._absorption, self._window_height,
                        self._beam_block_diameter, self._specular)

        def __repr__(self):
            return '{} #{}'.format(self.__str__(), id(self))


    exportsrc = os.path.join(USER_TMP_PATH, 'mccyl.h')
    make_user_dirs()

    cl_device = clinfo.gpus()[0]
    cl_device = clinfo.cl.CommandQueue(clinfo.cl.Context([cl_device]))

    cl_build_options = [cloptions.FastMath]

    pf = mcpf.Hg(0.7)
    mie_pf = Mie(1.6, 1.33, 1.0e-6, 500e-9)
    pf = mcpf.LutEx(Mie, (1.6, 1.33, 1.0e-6, 500e-9), lutsize=8000)
    layers = mclayer.Layers([
        mclayer.Layer(d=0.0,       n=1.00, mua=0.0, mus=0.0, pf=pf),
        mclayer.Layer(d=100.0e-3,  n=1.00, mua=0.0, mus=0.0, pf=pf),
        mclayer.Layer(d=2.0e-3,    n=1.45, mua=0.0, mus=0.0e2, pf=pf),
        mclayer.Layer(d=1.0e-3,    n=1.33, mua=0.0, mus=0.25e2, pf=pf),
    ])
    source = mcsource.Line()
    #source = mcsource.GaussianBeam(50e-6)
    #source = mcsource.UniformBeam(1e-6)
    #source = mcsource.IsotropicPoint(0.0)

    detectors = mcdetector.Detectors(
        outer=mcdetector.FiZ(
            mcdetector.Axis(-np.pi, np.pi, 1001),
            mcdetector.Axis(-1.5e-3, 1.5e-3, 3),
        ),
        specular=mcdetector.Total()
    )

    surface = mcsurface.SurfaceLayouts(
        internal=Coating(beam_block_diameter=5.0e-6, absorption=1.0,
                      specular=0.0, window_height=1e-3),
        outer=Coating(beam_block_diameter=5.0e-6, absorption=1.0,
                      specular=0.0, window_height=1e-3)
    ) 

    mc_obj = Mc(layers, source, detectors, surface=surface,
                cl_devices=cl_device, cl_build_options=cl_build_options,
                options=[mcoptions.McDebugMode.off, mcoptions.McUseSoft64Atomics.off,
                         mcoptions.McMinimumPacketWeight.default],
                types=mctypes.McDataTypes)
    mc_obj.rmax = 1000e-3

    out = None
    num_packets  = 100e6
    batch_packets = 100e6
    num_packets_done = 0
    while (num_packets_done < num_packets):
        out = mc_obj.run(batch_packets, out, exportsrc=exportsrc, verbose=True)
        num_packets_done += batch_packets
        print('Processed {}/{}'.format(num_packets_done, num_packets))

    _, _, detectors_res = out
    outer = detectors_res.outer
    reflectance = outer.reflectance[1, ...]
    reflectance[-1] = reflectance[-2]
    reflectance[0] = reflectance[1]
    #center = int(reflectance.size//2)
    #reflectance[center] = 0.5*(reflectance[center - 1] + reflectance[center + 1])

    pf_ref = mie_pf(np.cos(outer.fi))

    fig = pp.figure()

    ax1 = fig.add_subplot(2, 2, 1)
    ax2 = fig.add_subplot(2, 2, 2)
    ax3 = fig.add_subplot(2, 2, 3)
    ax1.semilogy(np.rad2deg(outer.fi), reflectance/reflectance.max())
    ax2.semilogy(np.rad2deg(outer.fi), reflectance/reflectance.max(), '-r', label='MC')
    ax2.semilogy(np.rad2deg(outer.fi), pf_ref/pf_ref.max(), '-g', label='Mie')
    ax2.legend()
    k = pf_ref.max()/reflectance.max()
    ax3.plot(np.rad2deg(outer.fi), (k*reflectance - pf_ref)/pf_ref)


    r_sample = mc_obj.layers[1].d*0.5
    eps = np.finfo(mc_obj.types.np_float).eps
    f = mctrace.Filter(
        dir=(np.cos(np.deg2rad(10.0)), 1.0 - eps, (-1.0, 0.0, 0.0)),
        r=(r_sample - eps, r_sample + eps, (0.0, 0.0)),
        z=(-0.5e-3, 0.5e-3), y=(0.0, np.inf)
    )
    trace = mctrace.Trace(maxlen=8, filter=f)
    mc_obj = Mc(layers, source, trace=trace, surface=surface,
                options=[mcoptions.McDebugMode.off])
    mc_obj.rmax = 1000e-3
    out = None
    n = 0
    batch_packets = 1000000
    while out is None or len(out[0]) < 10:
        out = mc_obj.run(batch_packets, out)
        n += batch_packets
        print('Processed {:,} packets, Collected traces: {:d}'.format(
            n, len(out[0])))

    trace_res = out[0]
    ax = fig.add_subplot(2, 2, 4)
    #fig = pp.figure()
    #ax = fig.add_subplot(1, 1, 1)
    for layer in mc_obj.layers[1:]:
        c = pp.Circle((0.0, 0.0), layer.d*0.5, fill=False)
        ax.add_artist(c)
    ax.set_xlim(-r_sample, r_sample)
    ax.set_ylim(-r_sample, r_sample)
    trace_res.plot(ax=ax, view='xy')
    ax.set_aspect('equal')
    pp.show()