Source code for instagraal.cuda_lib_gl_single

#!/usr/bin/env python3

import numpy as np

# import pyopencl as cl
import pycuda.tools
from pycuda import characterize
import pycuda.driver as cuda
import pycuda.compiler
from instagraal.gpustruct import GPUStruct
from pycuda import gpuarray as ga

# from pycuda.scan import InclusiveScanKernel

import pycuda.gl as cudagl

import codepy.cgen as cgen
from codepy.bpl import BoostPythonModule
from codepy.cuda import CudaModule
import codepy.jit
import codepy.toolchain

import time
import matplotlib.pyplot as plt

# import optim_rippe_curve as opti1
import instagraal.optim_rippe_curve_update as opti
import instagraal.init_nuisance as nuis

# from OpenGL.arrays import vbo
import scipy as scp

from instagraal import log
from instagraal.log import logger

import pkg_resources

logger.setLevel(log.CURRENT_LOG_LEVEL)

# from scipy import ndimage as ndi
# from scipy import stats
# import Image


[docs]class sampler:
    def __init__(
        self,
        use_rippe,
        S_o_A_frags,
        collector_id_repeats,
        frag_dispatcher,
        id_frag_duplicated,
        id_frags_blacklisted,
        n_frags,
        n_new_frags,
        init_n_sub_frags,
        n_new_sub_frags,
        np_rep_sub_frags_id,
        sub_sampled_sparse_matrix,
        np_sub_frags_len_bp,
        np_sub_frags_id,
        np_sub_frags_accu,
        np_sub_frags_2_frags,
        mean_squared_frags_per_bin,
        norm_vect_accu,
        sub_candidates_dup,
        sub_candidates_output_data,
        S_o_A_sub_frags,
        sub_collector_id_repeats,
        sub_frag_dispatcher,
        sparse_matrix,
        mean_value_trans,
        n_iterations,
        is_simu,
        gl_window,
        pos_vbo,
        col_vbo,
        vel,
        pos,
        raw_im_init,
        pbo_im_buffer,
        gl_size_im,
    ):
        self.o = 0

        self.log_e = 0.43429448190325182
        self.sparse_matrix = sparse_matrix + sparse_matrix.transpose()
        self.n_single_frags = np.int32(
            init_n_sub_frags - len(sub_candidates_dup)
        )
        self.sub_sampled_sparse_matrix = sub_sampled_sparse_matrix

        self.np_sub_frags_2_frags = np_sub_frags_2_frags
        self.gpu_sub_frags_2_frags = cuda.mem_alloc(
            self.np_sub_frags_2_frags.nbytes
        )
        cuda.memcpy_htod(self.gpu_sub_frags_2_frags, self.np_sub_frags_2_frags)

        # repeatead candidates info
        self.sub_candidates_dup = sub_candidates_dup
        self.sub_candidates_output_data = sub_candidates_output_data
        self.init_n_sub_frags = np.int32(init_n_sub_frags)
        self.sparse_data_2_gpu()
        self.use_rippe = use_rippe
        self.gl_window = gl_window
        self.ctx = gl_window.ctx_gl
        self.pos_vbo = pos_vbo
        self.col_vbo = col_vbo
        self.pos = pos
        self.vel = vel
        self.load_gl_cuda_vbo()

        self.id_frags_blacklisted = id_frags_blacklisted
        self.int4 = np.dtype(
            [
                ("x", np.int32),
                ("y", np.int32),
                ("z", np.int32),
                ("n", np.int32),
            ],
            align=True,
        )
        self.float3 = np.dtype(
            [("x", np.float32), ("y", np.float32), ("z", np.float32)],
            align=True,
        )
        self.int3 = np.dtype(
            [("x", np.int32), ("y", np.int32), ("z", np.int32)], align=True
        )

        self.np_id_frag_duplicated = np.int32(id_frag_duplicated)
        self.id_frag_duplicated = id_frag_duplicated
        self.n_frags = np.int32(n_frags)
        self.n_new_frags = np.int32(n_new_frags)

        self.n_new_sub_frags = np.int32(n_new_sub_frags)

        self.uniq_frags = np.int32(
            np.lib.arraysetops.setdiff1d(
                np.arange(0, self.n_frags, dtype=np.int32),
                self.np_id_frag_duplicated,
            )
        )
        self.n_frags_uniq = np.int32(len(self.uniq_frags))

        self.n_values_triu = np.int32(self.n_frags * (self.n_frags - 1) / 2)
        self.init_n_values_triu = np.int32(
            self.n_frags * (self.n_frags - 1) / 2
        )
        self.new_n_values_triu = np.int32(
            self.n_new_frags * (self.n_new_frags - 1) / 2
        )
        self.init_n_sub_values_triu = np.int32(
            self.init_n_sub_frags * (self.init_n_sub_frags - 1) / 2
        )
        self.new_n_sub_values_triu = np.int32(
            self.n_new_sub_frags * (self.n_new_sub_frags - 1) / 2
        )

        self.init_n_values_triu_extra = self.init_n_values_triu + self.n_frags
        self.init_n_sub_values_triu_extra = (
            self.init_n_sub_values_triu + self.init_n_sub_frags
        )

        self.n_insert_blocks = 6
        # self.n_insert_blocks = 0
        self.n_tmp_struct = 12 + self.n_insert_blocks * 2
        if self.n_insert_blocks == 0:
            self.active_insert_blocks = False
            self.size_block_4_sub = 128
        else:
            self.active_insert_blocks = True
            self.size_block_4_sub = 64

        self.is_simu = is_simu
        self.norm_vect_accu = norm_vect_accu
        self.np_sub_frags_len_bp = np_sub_frags_len_bp
        self.np_sub_frags_id = np_sub_frags_id
        self.np_sub_frags_accu = np_sub_frags_accu
        # self.hic_matrix_sub_sampled = hic_matrix_sub_sampled
        self.mean_squared_frags_per_bin = np.float32(
            mean_squared_frags_per_bin
        )
        # print "size hic matrix = ", hic_matrix.nbytes/10**6

        self.collector_id_repeats = collector_id_repeats
        self.gpu_collector_id_repeats = ga.to_gpu(
            ary=self.collector_id_repeats
        )
        self.frag_dispatcher = frag_dispatcher
        self.gpu_frag_dispatcher = cuda.mem_alloc(self.frag_dispatcher.nbytes)
        cuda.memcpy_htod(self.gpu_frag_dispatcher, self.frag_dispatcher)
        self.np_rep_sub_frags_id = np_rep_sub_frags_id
        self.gpu_rep_sub_frags_id = cuda.mem_alloc_like(
            self.np_rep_sub_frags_id
        )
        cuda.memcpy_htod(self.gpu_rep_sub_frags_id, self.np_rep_sub_frags_id)
        self.gpu_uniq_frags = ga.to_gpu(ary=self.uniq_frags)

        self.sub_collector_id_repeats = sub_collector_id_repeats
        self.sub_frag_dispatcher = sub_frag_dispatcher
        self.gpu_sub_frag_dispatcher = cuda.mem_alloc_like(
            self.sub_frag_dispatcher
        )
        cuda.memcpy_htod(
            self.gpu_sub_frag_dispatcher, self.sub_frag_dispatcher
        )
        self.gpu_sub_collector_id_repeats = cuda.mem_alloc_like(
            self.sub_collector_id_repeats
        )
        cuda.memcpy_htod(
            self.gpu_sub_collector_id_repeats, self.sub_collector_id_repeats
        )

        self.S_o_A_frags = S_o_A_frags
        self.S_o_A_sub_frags = S_o_A_sub_frags
        self.mean_n_accu = np.int32(
            np.round(self.S_o_A_frags["n_accu"].mean())
        )
        self.mean_len_bp_frags = self.S_o_A_sub_frags["len_bp"].mean()

        self.mean_value_trans = mean_value_trans

        self.param_simu_rippe = np.dtype(
            [
                ("kuhn", np.float32),
                ("lm", np.float32),
                ("c1", np.float32),
                ("slope", np.float32),
                ("d", np.float32),
                ("d_max", np.float32),
                ("fact", np.float32),
                ("v_inter", np.float32),
            ],
            align=True,
        )

        self.param_simu_exp = np.dtype(
            [
                ("d0", np.float32),
                ("d_max", np.float32),
                ("alpha_0", np.float32),
                ("alpha_1", np.float32),
                ("fact", np.float32),
                ("v_inter", np.float32),
            ],
            align=True,
        )
        if self.use_rippe:
            self.param_simu_T = self.param_simu_rippe
        else:
            self.param_simu_T = self.param_simu_exp

        self.setup_all_gpu_struct()

        self.n_iterations = n_iterations

        self.np_init_prev = np.copy(self.S_o_A_frags["prev"])
        self.np_init_next = np.copy(self.S_o_A_frags["next"])
        self.np_init_orientable = []
        for idf in range(0, self.n_new_frags):
            id_d = self.S_o_A_frags["id_d"][idf]
            self.np_init_orientable.append(self.np_sub_frags_id[id_d]["w"] > 1)
        self.np_init_orientable = np.array(
            self.np_init_orientable, dtype=np.int32
        )
        self.np_init_ori = np.ones((self.n_new_frags,), dtype=np.int32)
        # ########################
        self.n_generators = 100
        seed = 1
        self.rng_states = cuda.mem_alloc(
            self.n_generators
            * characterize.sizeof(
                "curandStateXORWOW", "#include <curand_kernel.h>"
            )
        )

        (free, total) = cuda.mem_get_info()
        logger.debug(
            (
                "Global memory occupancy after init:%f%% free"
                % (free * 100. / total)
            )
        )
        logger.debug(
            ("Global free memory after init:%i Mo free" % (free / 10 ** 6.))
        )

        logger.info("loading kernels ...")
        kernel_adapt_entry_point = pkg_resources.resource_filename(
            "instagraal", "kernels/kernel_sparse_adapt.cu"
        )
        kernel_entry_point = pkg_resources.resource_filename(
            "instagraal", "kernels/kernel_sparse.cu"
        )
        if self.active_insert_blocks:
            self.loadProgram(kernel_adapt_entry_point)
        else:
            self.loadProgram(kernel_entry_point)
        logger.info("kernels compiled")

        self.stride = 200

        seed = 1
        self.init_rng(
            np.int32(self.n_generators),
            self.rng_states,
            np.uint64(seed),
            np.uint64(0),
            block=(64, 1, 1),
            grid=(self.n_generators // 64 + 1, 1),
        )
        self.setup_distri_frags()

        # THRUST MODULE ##
        # Make a host_module, compiled for CPU
        self.setup_thrust_modules()

        # self.texref = self.module.get_texref("tex")
        self.raw_im_init = raw_im_init
        self.pbo_im_buffer = pbo_im_buffer
        self.gl_size_im = gl_size_im
        precision = 1
        self.sparse_data_4_gl(precision)
        self.load_gl_cuda_tex_buffer(self.raw_im_init)
        self.im_thresh = 50

[docs]    def setup_all_gpu_struct(self,):

        self.n_threads_mutations = int(
            np.power(2, np.floor(np.log2(self.n_tmp_struct)) + 1)
        )
        self.gpu_vect_frags = self.create_gpu_struct(data=self.S_o_A_frags)

        self.cpu_id_contigs = np.copy(self.S_o_A_frags["id_c"])
        self.gpu_id_contigs = ga.to_gpu(self.cpu_id_contigs)
        self.collector_gpu_vect_frags = []

        for k in range(0, self.n_tmp_struct):
            self.collector_gpu_vect_frags.append(
                self.create_gpu_struct(data=None)
            )

        sub_vect_dist = np.ones(
            (self.n_new_sub_frags * self.n_tmp_struct,), dtype=np.float32
        )
        self.collect_gpu_vect_dist = ga.to_gpu(sub_vect_dist)
        sub_vect_id_c = np.ones(
            (self.n_new_sub_frags * self.n_tmp_struct,), dtype=np.int32
        )
        self.collect_gpu_vect_id_c = ga.to_gpu(sub_vect_id_c)
        sub_vect_s_tot = np.ones(
            (self.n_new_sub_frags * self.n_tmp_struct,), dtype=np.float32
        )
        self.collect_gpu_vect_s_tot = ga.to_gpu(sub_vect_s_tot)
        sub_vect_pos = np.ones(
            (self.n_new_sub_frags * self.n_tmp_struct,), dtype=np.int32
        )
        self.collect_gpu_vect_pos = ga.to_gpu(sub_vect_pos)
        sub_vect_len = np.ones(
            (self.n_new_sub_frags * self.n_tmp_struct,), dtype=np.int32
        )
        self.collect_gpu_vect_len = ga.to_gpu(sub_vect_len)

        self.pop_gpu_vect_frags = self.create_gpu_struct(data=None)
        self.scrambled_gpu_vect_frags = self.create_gpu_struct(data=None)
        self.pop_cpu_id_contigs = np.copy(self.cpu_id_contigs)
        self.pop_gpu_id_contigs = ga.to_gpu(self.pop_cpu_id_contigs)
        self.trans1_gpu_vect_frags = self.create_gpu_struct(data=None)
        self.trans1_cpu_id_contigs = np.copy(self.cpu_id_contigs)
        self.trans1_gpu_id_contigs = ga.to_gpu(self.trans1_cpu_id_contigs)
        self.trans2_gpu_vect_frags = self.create_gpu_struct(data=None)
        self.trans2_cpu_id_contigs = np.copy(self.cpu_id_contigs)
        self.trans2_gpu_id_contigs = ga.to_gpu(self.trans2_cpu_id_contigs)

        self.gpu_counter_select = ga.zeros(1, dtype=np.int32)
        self.gpu_counter_select_gl = ga.zeros(1, dtype=np.int32)
        self.gpu_list_len_cont = ga.zeros(self.n_frags, dtype=np.int32)
        self.gpu_likelihood_on_zeros = ga.zeros(1, dtype=np.float64)
        self.gpu_likelihood_on_zeros_nuis = ga.zeros(1, dtype=np.float64)
        self.gpu_vect_likelihood_z = ga.zeros(
            self.n_tmp_struct, dtype=np.float64
        )
        self.gpu_n_vals_intra = ga.zeros(1, dtype=np.int32)
        self.gpu_all_n_vals_intra = ga.zeros(self.n_tmp_struct, dtype=np.int32)
        self.gpu_n_tot_sub_frags = ga.zeros(1, dtype=np.int32)
        self.n_pixl_sub_mat = (
            self.init_n_sub_frags * (self.init_n_sub_frags - 1) / 2
        )

        sub_vect_dist = np.ones((self.n_new_sub_frags,), dtype=np.float32)
        self.gpu_vect_dist = ga.to_gpu(sub_vect_dist)
        sub_vect_id_c = np.ones((self.n_new_sub_frags,), dtype=np.int32)
        self.gpu_vect_id_c = ga.to_gpu(sub_vect_id_c)
        sub_vect_s_tot = np.ones((self.n_new_sub_frags,), dtype=np.float32)
        self.gpu_vect_s_tot = ga.to_gpu(sub_vect_s_tot)
        sub_vect_pos = np.ones((self.n_new_sub_frags,), dtype=np.int32)
        self.gpu_vect_pos = ga.to_gpu(sub_vect_pos)
        sub_vect_len = np.ones((self.n_new_sub_frags,), dtype=np.int32)
        self.gpu_vect_len = ga.to_gpu(sub_vect_len)

        self.gpu_vect_dist_mut = ga.zeros(
            (self.n_new_sub_frags,), dtype=np.float32
        )
        self.gpu_vect_id_c_mut = ga.zeros(
            (self.n_new_sub_frags,), dtype=np.int32
        )
        self.gpu_vect_s_tot_mut = ga.zeros(
            (self.n_new_sub_frags,), dtype=np.float32
        )
        self.gpu_vect_pos_mut = ga.zeros(
            (self.n_new_sub_frags,), dtype=np.int32
        )
        self.gpu_vect_len_mut = ga.zeros(
            (self.n_new_sub_frags,), dtype=np.int32
        )

        self.gpu_list_uniq_mutations = ga.zeros(
            (self.n_tmp_struct,), dtype=np.int32
        )
        self.gpu_n_uniq = ga.zeros((1,), dtype=np.int32)
        self.gpu_n_pixl_sub_mat = ga.zeros((1,), dtype=np.float64)
        self.gpu_n_pixl_sub_mat.fill(np.float64(self.n_pixl_sub_mat))

        self.gpu_sub_sp_block_indptr = ga.to_gpu(
            np.zeros((self.n_non_zero,), dtype=np.int32)
        )
        self.gpu_info_blocks = ga.to_gpu(
            np.zeros(
                (int(self.n_non_zero / self.size_block_4_sub + 1),),
                dtype=self.int3,
            )
        )

        self.gpu_sub_vect_likelihood_nz = ga.to_gpu(
            np.zeros((self.n_tmp_struct,), dtype=np.float64)
        )
        self.gpu_curr_likelihood_nz_extract = ga.zeros(1, dtype=np.float64)

        self.gpu_all_scores = ga.to_gpu(
            np.zeros((self.n_tmp_struct,), dtype=np.float64)
        )

        self.gpu_curr_likelihood_nz = ga.to_gpu(
            np.zeros((1,), dtype=np.float64)
        )
        self.gpu_curr_likelihood_nz_nuis = ga.to_gpu(
            np.zeros((1,), dtype=np.float64)
        )
        self.gpu_val_likelihood_4_mut = ga.to_gpu(
            np.zeros((1,), dtype=np.float64)
        )

        self.gpu_uniq_id_c = ga.zeros((self.n_new_sub_frags,), dtype=np.int32)
        self.gpu_uniq_len = ga.zeros((self.n_new_sub_frags,), dtype=np.int32)
        self.gpu_old_2_new_id_c = ga.zeros(
            np.int32((self.n_new_sub_frags + self.n_new_sub_frags / 10,)),
            dtype=np.int32,
        )  # security length

        if self.active_insert_blocks:
            # self.gpu_list_bounds = ga.to_gpu(ary=np.array(range(1,
            # self.n_insert_blocks + 1), dtype=np.int32))
            list_size = np.array(
                [1, 3, 5, 10, 20, 50, 200, 200], dtype=np.int32
            )
            self.max_bounds_insert = list_size[
                : self.n_insert_blocks
            ].max() * np.int32(
                np.round(self.S_o_A_frags["sub_len"].mean()) + 1
            )  # approximation of sub size to extract in full matrix
            self.gpu_list_valid_insert = ga.zeros(
                (self.n_insert_blocks * 2), dtype=np.int32
            )
            self.gpu_list_bounds = ga.to_gpu(
                ary=np.array(list_size[: self.n_insert_blocks], dtype=np.int32)
            )
            self.gpu_list_f_upstream = ga.zeros(
                (self.n_insert_blocks,), dtype=np.int32
            )
            self.gpu_list_f_downstream = ga.zeros(
                (self.n_insert_blocks,), dtype=np.int32
            )

[docs]    def setup_thrust_modules(self):

        host_mod = BoostPythonModule()
        # Make a device module, compiled with NVCC
        nvcc_mod = CudaModule(host_mod)
        # Describe device module code
        # NVCC includes
        nvcc_includes = [
            "thrust/sort.h",
            "thrust/iterator/zip_iterator.h",
            "thrust/device_vector.h",
            "cuda.h",
            "thrust/scan.h",
        ]
        # Add includes to module
        nvcc_mod.add_to_preamble([cgen.Include(x) for x in nvcc_includes])
        # NVCC function sort by keys 3 cuda arrays
        nvcc_function_sort_by_keys_zip = cgen.FunctionBody(
            cgen.FunctionDeclaration(
                cgen.Value("void", "my_sort_zip"),
                [
                    cgen.Value("CUdeviceptr", "input_ptr_keys"),
                    cgen.Value("int", "length"),
                    cgen.Value("CUdeviceptr", "input_ptr_valsa"),
                    cgen.Value("CUdeviceptr", "input_ptr_valsb"),
                ],
            ),
            cgen.Block(
                [
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr((int*)input_ptr_keys)"
                    ),
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr_valsa((int*)input_ptr_valsa)"
                    ),
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr_valsb((int*)input_ptr_valsb)"
                    ),
                    cgen.Statement(
                        "thrust::tuple< thrust::device_ptr<int>, "
                        "thrust::device_ptr<int> > keytup_begin = "
                        "thrust::make_tuple(thrust_ptr_valsa,thrust_ptr_valsb)"
                    ),
                    cgen.Statement(
                        "thrust::zip_iterator<thrust::tuple"
                        "<thrust::device_ptr<int>, thrust::device_ptr<int> > >"
                        " first = thrust::make_zip_iterator(keytup_begin)"
                    ),
                    cgen.Statement(
                        "thrust::stable_sort_by_key(thrust_ptr, "
                        "thrust_ptr+length, first)"
                    ),
                ]
            ),
        )

        # Add declaration to nvcc_mod
        # Adds declaration to host_mod as well
        nvcc_mod.add_function(nvcc_function_sort_by_keys_zip)

        # NVCC function sort by keys 3 cuda arrays
        nvcc_function_sort_by_keys_zip_cmplex = cgen.FunctionBody(
            cgen.FunctionDeclaration(
                cgen.Value("void", "my_sort_zip_cmplex"),
                [
                    cgen.Value("CUdeviceptr", "input_ptr_keys_a"),
                    cgen.Value("CUdeviceptr", "input_ptr_keys_b"),
                    cgen.Value("int", "length"),
                    cgen.Value("CUdeviceptr", "input_ptr_vals"),
                ],
            ),
            cgen.Block(
                [
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr_v((int*)input_ptr_vals)"
                    ),
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr_keysa((int*)input_ptr_keys_a)"
                    ),
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr_keysb((int*)input_ptr_keys_b)"
                    ),
                    cgen.Statement(
                        "thrust::tuple< thrust::device_ptr<int>, "
                        "thrust::device_ptr<int> > keytup_begin = "
                        "thrust::make_tuple(thrust_ptr_keysa,thrust_ptr_keysb)"
                    ),
                    cgen.Statement(
                        "thrust::zip_iterator<thrust::tuple<thrust::device_ptr"
                        "<int>, thrust::device_ptr<int> > > first = "
                        "thrust::make_zip_iterator(keytup_begin)"
                    ),
                    cgen.Statement(
                        "thrust::stable_sort_by_key(first, first+length, "
                        "thrust_ptr_v)"
                    ),
                ]
            ),
        )

        # Add declaration to nvcc_mod
        # Adds declaration to host_mod as well
        nvcc_mod.add_function(nvcc_function_sort_by_keys_zip_cmplex)

        # NVCC function sort by keys a 2 cuda arrays
        nvcc_function_sort_by_keys_simple = cgen.FunctionBody(
            cgen.FunctionDeclaration(
                cgen.Value("void", "my_sort_simple"),
                [
                    cgen.Value("CUdeviceptr", "input_ptr_keys"),
                    cgen.Value("int", "length"),
                    cgen.Value("CUdeviceptr", "input_ptr_vals"),
                ],
            ),
            cgen.Block(
                [
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr((int*)input_ptr_keys)"
                    ),
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr_v((int*)input_ptr_vals)"
                    ),
                    cgen.Statement(
                        "thrust::stable_sort_by_key(thrust_ptr, "
                        "thrust_ptr+length, thrust_ptr_v, "
                        "thrust::greater<int>())"
                    ),
                ]
            ),
        )
        # Add declaration to nvcc_mod
        # Adds declaration to host_mod as well
        nvcc_mod.add_function(nvcc_function_sort_by_keys_simple)

        # NVCC function prefix sum
        nvcc_function_prefix_sum = cgen.FunctionBody(
            cgen.FunctionDeclaration(
                cgen.Value("void", "my_prefix_sum"),
                [
                    cgen.Value("CUdeviceptr", "input_ptr_vals"),
                    cgen.Value("int", "length"),
                ],
            ),
            cgen.Block(
                [
                    cgen.Statement(
                        "thrust::device_ptr<int> "
                        "thrust_ptr_v((int*)input_ptr_vals)"
                    ),
                    cgen.Statement(
                        "thrust::exclusive_scan(thrust_ptr_v, "
                        "thrust_ptr_v+length,thrust_ptr_v)"
                    ),
                ]
            ),
        )
        # Add declaration to nvcc_mod
        # Adds declaration to host_mod as well
        nvcc_mod.add_function(nvcc_function_prefix_sum)

        host_includes = ["boost/python/extract.hpp"]
        # Add host includes to module
        host_mod.add_to_preamble([cgen.Include(x) for x in host_includes])

        host_namespaces = ["using namespace boost::python"]

        # Add BPL using statement
        host_mod.add_to_preamble([cgen.Statement(x) for x in host_namespaces])

        host_statements_sort_zip = [
            # Extract information from PyCUDA GPUArray
            # Get length
            # 'tuple shape = extract<tuple>(gpu_array_keys.attr("shape"))',
            "int length = n_vals",
            # 'int length = extract<int>(shape[0])',
            # Get data pointer
            "CUdeviceptr ptr_keys = "
            'extract<CUdeviceptr>(gpu_array_keys.attr("gpudata"))',
            "CUdeviceptr ptr_valsa = "
            'extract<CUdeviceptr>(gpu_array_valsa.attr("gpudata"))',
            "CUdeviceptr ptr_valsb = "
            'extract<CUdeviceptr>(gpu_array_valsb.attr("gpudata"))',
            # Call Thrust routine, compiled into the CudaModule
            "my_sort_zip(ptr_keys, length, ptr_valsa, ptr_valsb)",
            # Return result
            "return gpu_array_keys",
        ]
        host_mod.add_function(
            cgen.FunctionBody(
                cgen.FunctionDeclaration(
                    cgen.Value("object", "sort_by_keys_zip"),
                    [
                        cgen.Value("object", "gpu_array_keys"),
                        cgen.Value("int", "n_vals"),
                        cgen.Value("object", "gpu_array_valsa"),
                        cgen.Value("object", "gpu_array_valsb"),
                    ],
                ),
                cgen.Block(
                    [cgen.Statement(x) for x in host_statements_sort_zip]
                ),
            )
        )

        host_statements_sort_zip_cmplex = [
            # Extract information from PyCUDA GPUArray
            # Get length
            # 'tuple shape = extract<tuple>(gpu_array_keys.attr("shape"))',
            "int length = n_vals",
            # 'int length = extract<int>(shape[0])',
            # Get data pointer
            """CUdeviceptr ptr_keys_a = """
            """extract<CUdeviceptr>(gpu_array_keys_a.attr("gpudata"))""",
            """CUdeviceptr ptr_keys_b = """
            """extract<CUdeviceptr>(gpu_array_keys_b.attr("gpudata"))""",
            """CUdeviceptr ptr_vals = """
            """extract<CUdeviceptr>(gpu_array_vals.attr("gpudata"))""",
            # Call Thrust routine, compiled into the CudaModule
            "my_sort_zip_cmplex(ptr_keys_a, ptr_keys_b, length, ptr_vals)",
            # Return result
            "return gpu_array_keys_a",
        ]
        host_mod.add_function(
            cgen.FunctionBody(
                cgen.FunctionDeclaration(
                    cgen.Value("object", "sort_by_keys_zip_cmplex"),
                    [
                        cgen.Value("object", "gpu_array_keys_a"),
                        cgen.Value("object", "gpu_array_keys_b"),
                        cgen.Value("int", "n_vals"),
                        cgen.Value("object", "gpu_array_vals"),
                    ],
                ),
                cgen.Block(
                    [
                        cgen.Statement(x)
                        for x in host_statements_sort_zip_cmplex
                    ]
                ),
            )
        )

        host_statements_sort_simple = [
            # Extract information from PyCUDA GPUArray
            # Get length
            # 'tuple shape = extract<tuple>(gpu_array_keys.attr("shape"))',
            "int length = n_vals",
            # 'int length = extract<int>(shape[0])',
            # Get data pointer
            """CUdeviceptr ptr_keys = """
            """extract<CUdeviceptr>(gpu_array_keys.attr("gpudata"))""",
            """CUdeviceptr ptr_vals = """
            """extract<CUdeviceptr>(gpu_array_vals.attr("gpudata"))""",
            # Call Thrust routine, compiled into the CudaModule
            "my_sort_simple(ptr_keys, length, ptr_vals)",
            # Return result
            "return gpu_array_keys",
        ]
        host_mod.add_function(
            cgen.FunctionBody(
                cgen.FunctionDeclaration(
                    cgen.Value("object", "sort_by_keys_simple"),
                    [
                        cgen.Value("object", "gpu_array_keys"),
                        cgen.Value("int", "n_vals"),
                        cgen.Value("object", "gpu_array_vals"),
                    ],
                ),
                cgen.Block(
                    [cgen.Statement(x) for x in host_statements_sort_simple]
                ),
            )
        )

        host_statements_prefix_sum = [
            # Extract information from PyCUDA GPUArray
            # Get length
            "int length = n_vals",
            # Get data pointer
            """CUdeviceptr ptr_vals = """
            """extract<CUdeviceptr>(gpu_array_vals.attr("gpudata"))""",
            # Call Thrust routine, compiled into the CudaModule
            "my_prefix_sum(ptr_vals, length)",
            # Return result
            "return gpu_array_vals",
        ]
        host_mod.add_function(
            cgen.FunctionBody(
                cgen.FunctionDeclaration(
                    cgen.Value("object", "prefix_sum"),
                    [
                        cgen.Value("object", "gpu_array_vals"),
                        cgen.Value("int", "n_vals"),
                    ],
                ),
                cgen.Block(
                    [cgen.Statement(x) for x in host_statements_prefix_sum]
                ),
            )
        )

        # Print out generated code, to see what we're actually compiling
        # print("---------------------- Host code ----------------------")
        # print(host_mod.generate())
        # print("--------------------- Device code ---------------------")
        # print(nvcc_mod.generate())
        # print("-------------------------------------------------------")

        # Compile modules

        gcc_toolchain = codepy.toolchain.guess_toolchain()
        nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain()

        self.thrust_module = nvcc_mod.compile(
            gcc_toolchain, nvcc_toolchain, debug=True
        )

[docs]    def create_gpu_struct(self, data):
        if data is None:
            gpu_vect = GPUStruct(
                [
                    (
                        np.int32,
                        "*pos",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*sub_pos",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*id_c",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*start_bp",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*len_bp",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*sub_len",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*circ",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*id",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*prev",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*next",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*l_cont",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*sub_l_cont",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*l_cont_bp",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*ori",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*rep",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*activ",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                    (
                        np.int32,
                        "*id_d",
                        np.zeros((self.n_new_frags,), dtype=np.int32),
                    ),
                ]
            )

        else:
            gpu_vect = GPUStruct(
                [
                    (np.int32, "*pos", np.copy(data["pos"])),
                    (np.int32, "*sub_pos", np.copy(data["sub_pos"])),
                    (np.int32, "*id_c", np.copy(data["id_c"])),
                    (np.int32, "*start_bp", np.copy(data["start_bp"])),
                    (np.int32, "*len_bp", np.copy(data["len_bp"])),
                    (np.int32, "*sub_len", np.copy(data["sub_len"])),
                    (np.int32, "*circ", np.copy(data["circ"])),
                    (np.int32, "*id", np.copy(data["id"])),
                    (np.int32, "*prev", np.copy(data["prev"])),
                    (np.int32, "*next", np.copy(data["next"])),
                    (np.int32, "*l_cont", np.copy(data["l_cont"])),
                    (np.int32, "*sub_l_cont", np.copy(data["sub_l_cont"])),
                    (np.int32, "*l_cont_bp", np.copy(data["l_cont_bp"])),
                    (
                        np.int32,
                        "*ori",
                        np.ones((self.n_new_frags,), dtype=np.int32),
                    ),
                    (np.int32, "*rep", np.copy(data["rep"])),
                    (np.int32, "*activ", np.copy(data["activ"])),
                    (np.int32, "*id_d", np.copy(data["id_d"])),
                ]
            )

        gpu_vect.copy_to_gpu()
        return gpu_vect

[docs]    def sparse_data_2_gpu(self,):

        # extract matrix withtout repeats
        id_single = []
        self.sym_sub_sampled_sparse_matrix = (
            self.sub_sampled_sparse_matrix + self.sub_sampled_sparse_matrix.T
        )
        for i in range(0, self.init_n_sub_frags):
            if i not in self.sub_candidates_dup:
                id_single.append(i)

        self.id_single = np.array(id_single, dtype=np.int32)
        self.id_rep = np.array(self.sub_candidates_dup, dtype=np.int32)
        mat_csr = self.sparse_matrix.tocsr()
        tmp_mat_1 = mat_csr[self.id_single, :]
        tmp_mat_2 = tmp_mat_1.tocsc()
        tmp_mat_3 = tmp_mat_2[:, self.id_single]
        self.mat_without_repeats = tmp_mat_3.tocsr()  # single Vs single

        self.hay_repeats = len(self.sub_candidates_dup) > 0
        if self.hay_repeats:
            self.mat_with_repeats = self.sparse_matrix[
                self.sub_candidates_dup, :
            ]  # repeats Vs all
        # HERE WE GO !!!!!
        total_mem_sparse = 0
        self.sub_sparse_sorted_data = []
        self.sub_sparse_sorted_ind = []
        self.nnz = self.sparse_matrix.data.shape[0]
        for i in range(0, self.sub_sampled_sparse_matrix.shape[0]):
            s0 = self.sub_sampled_sparse_matrix.indptr[i]
            s1 = self.sub_sampled_sparse_matrix.indptr[i + 1]
            loc_ind = np.copy(self.sub_sampled_sparse_matrix.indices[s0:s1])
            loc_data = np.copy(self.sub_sampled_sparse_matrix.data[s0:s1])
            id_sort = np.argsort(loc_data)
            data_2_push = list(loc_data[id_sort])
            data_2_push.reverse()
            ind_2_push = list(loc_ind[id_sort])
            ind_2_push.reverse()

            self.sub_sparse_sorted_data.extend(data_2_push)
            self.sub_sparse_sorted_ind.extend(ind_2_push)

        self.sparse_matrix_coo = self.sparse_matrix.tocoo()
        self.mat_without_repeats_coo_tmp = self.mat_without_repeats.tocoo()
        self.mat_without_repeats_coo = scp.sparse.triu(
            self.mat_without_repeats_coo_tmp, k=1, format="coo"
        )

        if self.hay_repeats:
            self.mat_with_repeats_coo_tmp = self.mat_with_repeats.tocoo()
            self.mat_with_repeats_coo = scp.sparse.triu(
                self.mat_with_repeats_coo_tmp, k=1, format="coo"
            )
        # self.gpu_sp_data = ga.to_gpu(ary=self.sparse_matrix.data)
        # self.gpu_sp_indptr = ga.to_gpu(ary=self.sparse_matrix.indptr)
        # self.gpu_sp_rows = ga.to_gpu(ary=self.sparse_matrix_coo.row)
        # self.gpu_sp_cols = ga.to_gpu(ary=self.sparse_matrix_coo.col)

        self.gpu_id_single = ga.to_gpu(ary=self.id_single)
        self.gpu_sp_no_rep_data = ga.to_gpu(
            ary=self.mat_without_repeats_coo.data
        )
        # self.gpu_sp_no_rep_indptr =
        # ga.to_gpu(ary=self.mat_without_repeats.indptr)
        self.gpu_sp_no_rep_rows = ga.to_gpu(
            ary=self.mat_without_repeats_coo.row
        )
        self.gpu_sp_no_rep_cols = ga.to_gpu(
            ary=self.mat_without_repeats_coo.col
        )

        self.gpu_sub_sp_no_rep_data = ga.to_gpu(
            ary=np.zeros_like(self.mat_without_repeats_coo.data)
        )
        self.gpu_sub_sp_no_rep_rows = ga.to_gpu(
            ary=np.zeros_like(self.mat_without_repeats_coo.data)
        )
        self.gpu_sub_sp_no_rep_cols = ga.to_gpu(
            ary=np.zeros_like(self.mat_without_repeats_coo.data)
        )

        self.n_non_zero = self.mat_without_repeats_coo.data.shape[0]
        self.gpu_collect_frags_4_sp = ga.to_gpu(
            ary=np.zeros((self.id_single.shape[0] + 1,), dtype=np.int32)
        )

        if self.hay_repeats:
            self.gpu_id_ok_rep = ga.to_gpu(ary=self.id_rep)
            self.gpu_sp_rep_data = ga.to_gpu(ary=self.mat_with_repeats.data)
            self.gpu_sp_rep_indptr = ga.to_gpu(
                ary=self.mat_with_repeats.indptr
            )
            self.gpu_sp_rep_rows = ga.to_gpu(ary=self.mat_with_repeats_coo.row)
            self.gpu_sp_rep_cols = ga.to_gpu(ary=self.mat_with_repeats_coo.col)

        # self.gpu_sp_n_indices = np.int32(self.sparse_matrix.indices.shape[0])

        total_mem_sparse = (
            self.sparse_matrix.data.nbytes
            + self.sparse_matrix.indptr.nbytes
            + self.sparse_matrix_coo.row.nbytes
            + self.sparse_matrix_coo.col.nbytes
        )

        # self.gpu_sub_sp_data =
        # ga.to_gpu(ary=self.sub_sampled_sparse_matrix.data)
        # self.gpu_sub_sp_indptr =
        # ga.to_gpu(ary=self.sub_sampled_sparse_matrix.indptr)
        # self.gpu_sub_sp_indices =
        # ga.to_gpu(ary=self.sub_sampled_sparse_matrix.indices)
        # self.gpu_sub_sp_n_indices =
        # np.int32(self.sub_sampled_sparse_matrix.indices.shape[0])

        # total_mem_sparse += self.sub_sampled_sparse_matrix.data.nbytes +
        # self.sub_sampled_sparse_matrix.indptr.nbytes + \
        #                     self.sub_sampled_sparse_matrix.indices.nbytes
        logger.info(
            "total mem used by sparse data = {}".format(
                np.float32(total_mem_sparse) / 10 ** 6.
            )
        )

[docs]    def sparse_data_4_gl(self, precision):
        # create sparse data 4 opengl purposes. take only value above limit
        # self.sub_mat = self.sub_sampled_sparse_matrix.tocoo()
        self.sub_mat = scp.sparse.triu(
            self.sub_sampled_sparse_matrix, k=1, format="coo"
        )
        rows = self.sub_mat.row
        cols = self.sub_mat.col
        data = self.sub_mat.data
        id = np.nonzero(data > precision)[0]
        self.n_data_4_gl = len(id)
        self.mat_4_gl = scp.sparse.coo_matrix(
            (data[id], (rows[id], cols[id])), shape=self.sub_mat.shape
        )
        self.gpu_rows_4_gl = ga.to_gpu(ary=self.mat_4_gl.row)
        self.gpu_cols_4_gl = ga.to_gpu(ary=self.mat_4_gl.col)
        self.gpu_data_4_gl = ga.to_gpu(ary=self.mat_4_gl.data)
        self.gpu_ptr_4_gl = ga.zeros_like(self.gpu_data_4_gl)
        self.gpu_counter_select_4_gl = ga.zeros((1,), dtype=np.int32)
        self.gpu_vect_gl_pxl_frag = ga.zeros(
            (self.n_new_frags,), dtype=np.int32
        )

[docs]    def dist_inter_genome(self, tmp_gpu_vect_frags):
        tmp_gpu_vect_frags.copy_from_gpu()
        g1 = tmp_gpu_vect_frags
        n_frags_blacklisted = len(self.id_frags_blacklisted)
        d = 3.0 * (self.n_new_frags - n_frags_blacklisted)
        norm_distance = 3.0 * (self.n_new_frags - n_frags_blacklisted)
        # d = 3.0 * self.n_new_frags
        for id_f in range(0, self.n_new_frags):
            if id_f not in self.id_frags_blacklisted:
                prev_t0 = self.np_init_prev[id_f]
                prev_t1 = g1.prev[id_f]
                next_t0 = self.np_init_next[id_f]
                next_t1 = g1.next[id_f]
                ori_t0 = self.np_init_ori[id_f]
                ori_t1 = g1.ori[id_f]
                swap = 1
                if ((prev_t1 == prev_t0) and (next_t1 == next_t0)) or (
                    (prev_t1 == next_t0) and (next_t1 == prev_t0)
                ):
                    d -= 1
                    # if not(self.np_init_orientable[id_f]):
                    #     d -= 2
                if self.np_init_orientable[id_f]:
                    if ori_t0 != ori_t1:
                        tmp = prev_t1
                        prev_t1 = next_t1
                        next_t1 = tmp
                        swap = -1
                    if prev_t0 == prev_t1:
                        if prev_t0 == -1:
                            d -= 1
                        elif not (self.np_init_orientable[prev_t1]):
                            d -= 1
                        else:
                            d -= 0.5
                            ori_prev_t0 = self.np_init_ori[prev_t0]
                            ori_prev_t1 = g1.ori[prev_t1]
                            if ori_prev_t0 == swap * ori_prev_t1:
                                d -= 0.5
                    if next_t0 == next_t1:
                        if next_t0 == -1:
                            d -= 1
                        elif not (self.np_init_orientable[next_t1]):
                            d -= 1
                        else:
                            d -= 0.5
                            ori_next_t0 = self.np_init_ori[next_t0]
                            ori_next_t1 = g1.ori[next_t1]
                            if ori_next_t0 == swap * ori_next_t1:
                                d -= 0.5
                else:
                    if (prev_t1 == prev_t0) or (prev_t1 == next_t0):
                        d -= 1
                    if (next_t1 == next_t0) or (next_t1 == prev_t0):
                        d -= 1
        return d / norm_distance

[docs]    def approx_single_likelihood_on_zeros(self,):

        start = cuda.Event()
        end = cuda.Event()
        stride = 1

        size_block = 1024
        block_ = (size_block, 1, 1)
        n_blocks = int(int(self.init_n_sub_frags) / size_block + 1)
        grid_ = (max(1, int(n_blocks // stride)), 1)
        # print "block = ", block_
        # print "grid_all = ", grid_
        self.gpu_likelihood_on_zeros.fill(0)
        self.gpu_n_vals_intra.fill(0)
        self.ctx.synchronize()

        start.record()
        self.kern_eval_likelihood_zeros(
            self.gpu_vect_id_c,
            self.gpu_vect_s_tot,
            self.gpu_vect_pos,
            self.gpu_vect_len,
            self.gpu_param_simu,
            np.int32(self.mean_len_bp_frags / 1000.),
            self.gpu_likelihood_on_zeros,
            self.gpu_n_vals_intra,
            np.int32(self.init_n_sub_frags),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()

        # print "CUDA clock execution timing (compute likelihood on zeros): ",
        # secs
        self.val_on_zero_intra = (
            self.gpu_likelihood_on_zeros.get()[0] * self.log_e
        )
        self.n_vals_intra = self.gpu_n_vals_intra.get()[0]

        self.val_on_zero_inter = (
            self.log_e
            * (self.n_pixl_sub_mat - self.n_vals_intra)
            * -1.0
            * self.param_simu["v_inter"][0]
        )
        self.curr_likelihood_on_z = (
            self.val_on_zero_intra + self.val_on_zero_inter
        )
        # print "GPU execution time ( approx on zeros single) = ", t1 - t0

[docs]    def approx_single_likelihood_on_zeros_nuisance(self,):

        start = cuda.Event()
        end = cuda.Event()
        stride = 1

        size_block = 1024
        block_ = (size_block, 1, 1)
        n_blocks = int(int(self.init_n_sub_frags) / size_block + 1)
        grid_ = int(max(1, n_blocks / stride)), 1
        # print "block = ", block_
        # print "grid_all = ", grid_
        self.gpu_likelihood_on_zeros_nuis.fill(0)
        self.gpu_n_vals_intra.fill(0)
        start.record()
        self.kern_eval_likelihood_zeros(
            self.gpu_vect_id_c,
            self.gpu_vect_s_tot,
            self.gpu_vect_pos,
            self.gpu_vect_len,
            self.gpu_param_simu_test,
            np.float32(self.mean_len_bp_frags / 1000.),
            self.gpu_likelihood_on_zeros_nuis,
            self.gpu_n_vals_intra,
            np.int32(self.init_n_sub_frags),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()
        # print "CUDA clock execution timing (compute likelihood on zeros): ",
        # secs
        self.val_on_zero_intra_nuis = (
            self.gpu_likelihood_on_zeros_nuis.get()[0] * self.log_e
        )
        self.n_vals_intra = self.gpu_n_vals_intra.get()[0]
        self.val_on_zero_inter_nuis = (
            self.log_e
            * (self.n_pixl_sub_mat - self.n_vals_intra)
            * -1.0
            * self.param_simu_test["v_inter"][0]
        )
        self.curr_likelihood_on_z_nuis = (
            self.val_on_zero_intra_nuis + self.val_on_zero_inter_nuis
        )
        # print "GPU execution time ( approx on zeros single) = ", t1 - t0

[docs]    def approx_single_likelihood_on_zeros_mut(self,):

        start = cuda.Event()
        end = cuda.Event()
        stride = 1

        size_block = 1024
        block_ = (size_block, 1, 1)
        n_blocks = int(self.init_n_sub_frags) / size_block + 1
        grid_ = (max(1, n_blocks / stride), 1)
        # print "block = ", block_
        # print "grid_all = ", grid_
        self.gpu_likelihood_on_zeros.fill(0)
        self.gpu_n_vals_intra.fill(0)
        self.ctx.synchronize()

        start.record()
        self.kern_eval_likelihood_zeros(
            self.gpu_vect_id_c_mut,
            self.gpu_vect_s_tot_mut,
            self.gpu_vect_pos_mut,
            self.gpu_vect_len_mut,
            self.gpu_param_simu,
            np.float32(self.mean_len_bp_frags / 1000.),
            self.gpu_likelihood_on_zeros,
            self.gpu_n_vals_intra,
            np.int32(self.init_n_sub_frags),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()

        # print "CUDA clock execution timing (compute likelihood on zeros): ",
        # secs
        self.val_on_zero_intra_mut = (
            self.gpu_likelihood_on_zeros.get()[0] * self.log_e
        )
        self.n_vals_intra = self.gpu_n_vals_intra.get()[0]
        self.val_on_zero_inter_mut = (
            self.log_e
            * (self.n_pixl_sub_mat - self.n_vals_intra)
            * -1.0
            * self.param_simu["v_inter"][0]
        )
        self.curr_likelihood_on_z_mut = (
            self.val_on_zero_intra_mut + self.val_on_zero_inter_mut
        )
        # self.curr_likelihood_on_z_mut = self.val_on_zero_inter_mut
        # print "GPU execution time ( approx on zeros single) = ", t1 - t0

[docs]    def approx_all_likelihood_on_zeros(self,):

        start = cuda.Event()
        end = cuda.Event()
        stride = 1

        size_block = 1024
        block_ = (size_block, 1, 1)
        n_blocks = int(int(self.init_n_sub_frags) / size_block + 1)
        grid_ = (int(max(1, n_blocks / stride)), 1)

        self.gpu_vect_likelihood_z.fill(0)
        self.gpu_all_n_vals_intra.fill(0)
        self.ctx.synchronize()

        start.record()
        self.kern_eval_all_likelihood_zeros_1st(
            self.collect_gpu_vect_id_c,
            self.collect_gpu_vect_s_tot,
            self.collect_gpu_vect_pos,
            self.collect_gpu_vect_len,
            self.gpu_param_simu,
            np.float32(self.mean_len_bp_frags / 1000.),
            self.gpu_list_uniq_mutations,
            self.gpu_n_uniq,
            self.gpu_vect_likelihood_z,
            self.gpu_all_n_vals_intra,
            np.int32(self.init_n_sub_frags),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()

        start.record()
        self.kern_eval_all_likelihood_zeros_2nd(
            self.gpu_list_uniq_mutations,
            self.gpu_n_uniq,
            self.gpu_param_simu,
            self.gpu_vect_likelihood_z,
            self.gpu_all_n_vals_intra,
            self.gpu_n_pixl_sub_mat,
            block=(self.n_threads_mutations, 1, 1),
            grid=(1, 1),
        )
        end.record()
        end.synchronize()

[docs]    def fill_dist_all_mut(self,):
        start = cuda.Event()
        end = cuda.Event()
        size_block = 1024
        block_ = (size_block, 1, 1)
        size_all_data = int(self.init_n_sub_frags)
        n_block = size_all_data // (size_block) + 1
        size_grid = max(int(n_block), 1)
        grid_all = (size_grid, 1)

        for id_mut in range(0, self.n_tmp_struct):
            start.record()
            self.kern_fill_vect_dist_all(
                self.gpu_sub_frags_2_frags,
                self.collector_gpu_vect_frags[id_mut].get_ptr(),
                self.collect_gpu_vect_dist,
                self.collect_gpu_vect_id_c,
                self.collect_gpu_vect_s_tot,
                self.collect_gpu_vect_pos,
                self.collect_gpu_vect_len,
                self.gpu_collector_id_repeats,
                self.gpu_frag_dispatcher,
                self.gpu_sub_collector_id_repeats,
                self.gpu_sub_frag_dispatcher,
                np.int32(self.init_n_sub_frags),
                np.int32(id_mut),
                block=block_,
                grid=grid_all,
            )

            end.record()
            end.synchronize()
            # print "CUDA clock execution timing (compute sub vect distances):
            # ", secs print "Total time vect frags to sub frags = ",
            # time.time() - t0

[docs]    def fill_dist_single(self,):

        start = cuda.Event()
        end = cuda.Event()
        size_block = 1024
        block_ = (size_block, 1, 1)
        size_all_data = int(self.init_n_sub_frags)
        n_block = size_all_data // (size_block) + 1
        size_grid = max(int(n_block / 1), 1)
        grid_all = (size_grid, 1)

        start.record()
        self.kern_fill_vect_dist_single(
            self.gpu_sub_frags_2_frags,
            self.gpu_vect_frags.get_ptr(),
            self.gpu_vect_dist,
            self.gpu_vect_id_c,
            self.gpu_vect_s_tot,
            self.gpu_vect_pos,
            self.gpu_vect_len,
            self.gpu_collector_id_repeats,
            self.gpu_frag_dispatcher,
            self.gpu_sub_collector_id_repeats,
            self.gpu_sub_frag_dispatcher,
            np.int32(self.init_n_sub_frags),
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()
        # print "CUDA clock execution timing (compute sub vect distances): ",
        # secs
        # print "Total time vect frags to sub frags = ", time.time() - t0

[docs]    def fill_dist_single_mut(self, id_mut):

        start = cuda.Event()
        end = cuda.Event()
        size_block = 1024
        block_ = (size_block, 1, 1)
        size_all_data = int(self.init_n_sub_frags)
        n_block = size_all_data // (size_block) + 1
        size_grid = max(int(n_block / 1), 1)
        grid_all = (size_grid, 1)

        if id_mut == -1:
            vect_frags = self.gpu_vect_frags
        else:
            vect_frags = self.collector_gpu_vect_frags[id_mut]
        start.record()
        self.kern_fill_vect_dist_single(
            self.gpu_sub_frags_2_frags,
            vect_frags.get_ptr(),
            self.gpu_vect_dist_mut,
            self.gpu_vect_id_c_mut,
            self.gpu_vect_s_tot_mut,
            self.gpu_vect_pos_mut,
            self.gpu_vect_len_mut,
            self.gpu_collector_id_repeats,
            self.gpu_frag_dispatcher,
            self.gpu_sub_collector_id_repeats,
            self.gpu_sub_frag_dispatcher,
            np.int32(self.init_n_sub_frags),
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()
        # print "CUDA clock execution timing (compute sub vect distances): ",
        # secs print "Total time vect frags to sub frags = ", time.time() - t0

[docs]    def slice_sparse_mat(self, id_ctg1, id_ctg2, id_fragA, id_fragB):

        size_block = self.size_block_4_sub
        block_ = (size_block, 1, 1)
        start = cuda.Event()
        end = cuda.Event()

        grid_ = (int(self.n_non_zero // size_block + 1), 1)
        # print "grid = ", grid_
        self.gpu_counter_select.fill(0)
        self.ctx.synchronize()
        start.record()
        self.kern_slice_sp_mat(
            self.gpu_sp_no_rep_data,
            self.gpu_sp_no_rep_rows,
            self.gpu_sp_no_rep_cols,
            self.gpu_vect_frags.get_ptr(),
            self.gpu_vect_id_c,
            self.gpu_vect_pos,
            self.gpu_sub_sp_no_rep_rows,
            self.gpu_sub_sp_no_rep_cols,
            self.gpu_sub_sp_no_rep_data,
            np.int32(id_ctg1),
            np.int32(id_ctg2),
            np.int32(id_fragA),
            np.int32(id_fragB),
            np.int32(self.max_bounds_insert),
            self.gpu_counter_select,
            np.int32(self.n_non_zero),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()
        self.n_sub_vals = self.gpu_counter_select.get()[0]

        self.thrust_module.sort_by_keys_zip(
            self.gpu_sub_sp_no_rep_rows,
            int(self.n_sub_vals),
            self.gpu_sub_sp_no_rep_cols,
            self.gpu_sub_sp_no_rep_data,
        )

        n_blocks = int(self.n_sub_vals // self.size_block_4_sub + 1)
        size_grid = max(int(n_blocks), 1)
        grid_all = (size_grid, 1)
        self.gpu_counter_select.fill(0)
        self.ctx.synchronize()

        start.record()
        self.kern_prepare_sparse_call(
            self.gpu_sub_sp_no_rep_rows,
            self.gpu_info_blocks,
            self.gpu_sub_sp_block_indptr,
            self.gpu_counter_select,
            np.int32(self.n_sub_vals),
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()

        # print "GPU all splice: elapsed time  = ", t_end - t_start print "GPU
        # Thrust sort: elapsed time  = ", t_thrust_1 - t_thrust_0 print "GPU
        # splice init : elapsed time = ", elapsed_seconds_splice print "GPU
        # splice prepare call : elapsed time = ", elapsed_seconds_prepare

[docs]    def show_sub_slice(self, id_ctg1, id_ctg2, id_frag_a, id_frag_b):
        self.gpu_sub_sp_no_rep_data.fill(0)
        self.gpu_sub_sp_no_rep_rows.fill(0)
        self.gpu_sub_sp_no_rep_cols.fill(0)
        self.slice_sparse_mat(id_ctg1, id_ctg2, id_frag_a, id_frag_b)
        dat = self.gpu_sub_sp_no_rep_data.get()[: self.n_sub_vals]
        rows = self.gpu_sub_sp_no_rep_rows.get()[: self.n_sub_vals]
        cols = self.gpu_sub_sp_no_rep_cols.get()[: self.n_sub_vals]

        self.np_sub_mat = scp.sparse.coo_matrix(
            (dat, (rows, cols)),
            shape=(self.init_n_sub_frags, self.init_n_sub_frags),
        )
        plt.spy(self.np_sub_mat, markersize=0.1)
        plt.show()

[docs]    def eval_all_sub_likelihood(self,):

        self.fill_dist_all_mut()

        self.approx_all_likelihood_on_zeros()

        block_ = (self.size_block_4_sub, 1, 1)

        start = cuda.Event()
        end = cuda.Event()

        n_blocks = int(self.n_sub_vals // self.size_block_4_sub + 1)
        size_grid = max(int(n_blocks), 1)
        grid_all = (size_grid, 1)
        self.gpu_sub_vect_likelihood_nz.fill(0.0)
        self.gpu_all_scores.fill(0.0)
        self.ctx.synchronize()

        start.record()
        self.kern_sub_likelihood(
            self.gpu_sub_sp_no_rep_data,
            self.gpu_info_blocks,
            self.gpu_sub_sp_block_indptr,
            self.gpu_sub_sp_no_rep_rows,
            self.gpu_sub_sp_no_rep_cols,
            self.gpu_param_simu,
            np.float32(self.mean_len_bp_frags / 1000.0),
            self.collect_gpu_vect_dist,
            self.collect_gpu_vect_id_c,
            self.collect_gpu_vect_s_tot,
            self.collect_gpu_vect_pos,
            self.collect_gpu_vect_len,
            self.gpu_list_uniq_mutations,
            self.gpu_n_uniq,
            self.gpu_sub_vect_likelihood_nz,
            np.int32(self.n_sub_vals),
            np.int32(self.init_n_sub_frags),
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()
        # print "CUDA clock execution timing(sub likelihood computing): ", secs
        start.record()
        self.kern_eval_all_scores(
            self.gpu_list_uniq_mutations,
            self.gpu_n_uniq,
            self.gpu_vect_likelihood_z,
            self.gpu_sub_vect_likelihood_nz,
            self.gpu_curr_likelihood_nz_extract,
            self.gpu_curr_likelihood_nz,
            self.gpu_all_scores,
            block=(32, 1, 1),
            grid=(1, 1),
        )
        end.record()
        end.synchronize()

        score = self.gpu_all_scores.get()

        return score

[docs]    def extract_current_sub_likelihood(self):

        block_ = (self.size_block_4_sub, 1, 1)

        start = cuda.Event()
        end = cuda.Event()
        n_blocks = int(self.n_sub_vals // self.size_block_4_sub + 1)
        size_grid = max(int(n_blocks), 1)
        grid_all = (size_grid, 1)
        self.gpu_curr_likelihood_nz_extract.fill(0.0)
        self.ctx.synchronize()
        start.record()
        self.kern_extract_sub_likelihood(
            self.gpu_sub_sp_no_rep_data,
            self.gpu_info_blocks,
            self.gpu_sub_sp_block_indptr,
            self.gpu_sub_sp_no_rep_rows,
            self.gpu_sub_sp_no_rep_cols,
            self.gpu_param_simu,
            np.float32(self.mean_len_bp_frags / 1000.0),
            self.gpu_vect_dist,
            self.gpu_vect_id_c,
            self.gpu_vect_s_tot,
            self.gpu_vect_pos,
            self.gpu_vect_len,
            self.gpu_curr_likelihood_nz_extract,
            np.int32(self.n_sub_vals),
            np.int32(self.init_n_sub_frags),
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()
        # print "CUDA clock execution timing(extract sub likelihood computing):
        # ", secs
        self.likelihood_extracted_nz = (
            self.gpu_curr_likelihood_nz_extract.get()
        )

[docs]    def eval_likelihood_init(self,):

        start = cuda.Event()
        end = cuda.Event()

        self.fill_dist_single()

        pct_size = 1
        stride = 1
        size_block = 1024
        block_ = (size_block, 1, 1)
        # pct_size = 1
        size_all_data = int(self.gpu_sp_no_rep_data.shape[0] * pct_size)
        # print "size all data = ", size_all_data
        n_block = size_all_data // (size_block) + 1
        size_grid = max(int(n_block / stride), 1)
        grid_all = (size_grid, 1)
        # print "block = ", block_
        # print "grid_all = ", grid_all

        self.approx_single_likelihood_on_zeros()
        self.gpu_curr_likelihood_nz.fill(0.0)
        self.ctx.synchronize()

        start.record()
        self.kern_evaluate_likelihood_single(
            self.gpu_sp_no_rep_data,
            self.gpu_sp_no_rep_rows,
            self.gpu_sp_no_rep_cols,
            self.gpu_id_single,
            self.gpu_param_simu,
            np.float32(self.mean_len_bp_frags / 1000.0),
            self.gpu_vect_dist,
            self.gpu_vect_id_c,
            self.gpu_vect_s_tot,
            self.gpu_vect_pos,
            self.gpu_vect_len,
            self.gpu_curr_likelihood_nz,
            np.int32(size_all_data),
            self.n_single_frags,
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()

        likelihood_on_nz = self.gpu_curr_likelihood_nz.get()
        self.curr_likelihood_on_nz = likelihood_on_nz
        self.likelihood_t = likelihood_on_nz + self.curr_likelihood_on_z

[docs]    def eval_likelihood(self,):

        start = cuda.Event()
        end = cuda.Event()

        self.fill_dist_single()

        pct_size = 1
        stride = 1
        size_block = 1024
        block_ = (size_block, 1, 1)
        # pct_size = 1
        size_all_data = int(self.gpu_sp_no_rep_data.shape[0] * pct_size)
        # print "size all data = ", size_all_data
        n_block = size_all_data // (size_block) + 1
        size_grid = max(int(n_block / stride), 1)
        grid_all = (size_grid, 1)
        # print "block = ", block_
        # print "grid_all = ", grid_all

        self.approx_single_likelihood_on_zeros()

        self.gpu_curr_likelihood_nz.fill(0.0)
        self.ctx.synchronize()

        start.record()
        self.kern_evaluate_likelihood_single(
            self.gpu_sp_no_rep_data,
            self.gpu_sp_no_rep_rows,
            self.gpu_sp_no_rep_cols,
            self.gpu_id_single,
            self.gpu_param_simu,
            np.float32(self.mean_len_bp_frags / 1000.0),
            self.gpu_vect_dist,
            self.gpu_vect_id_c,
            self.gpu_vect_s_tot,
            self.gpu_vect_pos,
            self.gpu_vect_len,
            self.gpu_curr_likelihood_nz,
            np.int32(size_all_data),
            self.n_single_frags,
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()
        # likelihood_on_nz = self.gpu_curr_likelihood_nz.get()
        # self.curr_likelihood_on_nz = likelihood_on_nz
        # self.curr_likelihood = likelihood_on_nz + self.curr_likelihood_on_z
        # print "GPU time likelihood ALL = ", time.time() - t1
        # print "likelihood on nz= ", likelihood_on_nz
        # return self.curr_likelihood_on_nz, self.curr_likelihood_on_z

[docs]    def eval_likelihood_4_nuisance(self,):
        start = cuda.Event()
        end = cuda.Event()

        pct_size = 1
        stride = 1
        size_block = 1024
        block_ = (size_block, 1, 1)
        # pct_size = 1
        size_all_data = int(self.gpu_sp_no_rep_data.shape[0] * pct_size)
        # print "size all data = ", size_all_data
        n_block = size_all_data // (size_block) + 1
        size_grid = max(int(n_block / stride), 1)
        grid_all = (size_grid, 1)
        # print "block = ", block_
        # print "grid_all = ", grid_all

        self.approx_single_likelihood_on_zeros_nuisance()

        self.gpu_curr_likelihood_nz_nuis.fill(0.0)
        start.record()
        self.kern_evaluate_likelihood_single(
            self.gpu_sp_no_rep_data,
            self.gpu_sp_no_rep_rows,
            self.gpu_sp_no_rep_cols,
            self.gpu_id_single,
            self.gpu_param_simu_test,
            np.float32(self.mean_len_bp_frags / 1000.0),
            self.gpu_vect_dist,
            self.gpu_vect_id_c,
            self.gpu_vect_s_tot,
            self.gpu_vect_pos,
            self.gpu_vect_len,
            self.gpu_curr_likelihood_nz_nuis,
            np.int32(size_all_data),
            self.n_single_frags,
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()
        likelihood_on_nz_nuis = self.gpu_curr_likelihood_nz_nuis.get()
        # self.curr_likelihood_on_nz = likelihood_on_nz
        self.curr_likelihood_nuis = (
            likelihood_on_nz_nuis + self.curr_likelihood_on_z_nuis
        )
        # print "GPU time likelihood ALL = ", time.time() - t1
        # print "likelihood on nz= ", likelihood_on_nz
        return self.curr_likelihood_nuis

[docs]    def eval_likelihood_on_mut(self, id_mut):

        start = cuda.Event()
        end = cuda.Event()
        self.fill_dist_single_mut(id_mut)
        self.approx_single_likelihood_on_zeros_mut()

        pct_size = 1
        stride = 1
        size_block = 1024
        block_ = (size_block, 1, 1)
        # pct_size = 1
        size_all_data = int(self.gpu_sp_no_rep_data.shape[0] * pct_size)
        # print "size all data = ", size_all_data
        n_block = size_all_data // (size_block) + 1
        size_grid = max(int(n_block / stride), 1)
        grid_all = (size_grid, 1)
        # print "block = ", block_
        # print "grid_all = ", grid_all

        self.gpu_val_likelihood_4_mut.fill(0.0)
        self.ctx.synchronize()

        start.record()
        self.kern_evaluate_likelihood_single(
            self.gpu_sp_no_rep_data,
            self.gpu_sp_no_rep_rows,
            self.gpu_sp_no_rep_cols,
            self.gpu_id_single,
            self.gpu_param_simu,
            np.float32(self.mean_len_bp_frags / 1000.0),
            self.gpu_vect_dist_mut,
            self.gpu_vect_id_c_mut,
            self.gpu_vect_s_tot_mut,
            self.gpu_vect_pos_mut,
            self.gpu_vect_len_mut,
            self.gpu_val_likelihood_4_mut,
            np.int32(size_all_data),
            self.n_single_frags,
            block=block_,
            grid=grid_all,
        )
        end.record()
        end.synchronize()

        # likelihood_on_nz = ga.sum(self.gpu_val_likelihood_4_mut,
        # dtype=np.float64).get()
        likelihood_on_nz = self.gpu_val_likelihood_4_mut.get()
        self.curr_likelihood_on_nz_mut = likelihood_on_nz
        # self.curr_likelihood_on_nz_mut = likelihood_on_nz

        # print "GPU time likelihood ALL = ", time.time() - t1
        # print "likelihood on nz= ", likelihood_on_nz
        return self.curr_likelihood_on_nz_mut + self.curr_likelihood_on_z_mut

[docs]    def step_sampler(self, id_frag, n_neighbours, dt):

        self.candidates = self.return_neighbours(id_frag, n_neighbours)
        self.candidates.sort()
        n = len(self.candidates)

        self.fill_dist_single()

        self.eval_likelihood()
        self.gpu_vect_frags.copy_from_gpu()
        id_c_prev_cand = -1
        id_ctg_a = self.gpu_vect_frags.id_c[id_frag]
        # print "id_contig a = ", id_ctg_a
        self.all_scores = np.zeros((self.n_tmp_struct * n), dtype=np.float64)
        max_id = self.modify_gl_cuda_buffer(id_frag, dt)
        flip_eject = 1
        for (id_cand, i) in zip(self.candidates, list(range(0, n))):
            self.gl_window.remote_update()
            self.extract_uniq_mutations(id_frag, id_cand, flip_eject)
            self.perform_mutations(id_frag, id_cand, max_id, 1 == 0)
            id_ctg_b = self.gpu_vect_frags.id_c[id_cand]
            if id_ctg_b != id_c_prev_cand:
                # print "id_contig b = ", id_ctg_b
                # print "slicing sparse matrix!"
                self.slice_sparse_mat(id_ctg_a, id_ctg_b, id_frag, id_cand)
                self.extract_current_sub_likelihood()
            else:
                self.slice_sparse_mat(id_ctg_a, id_ctg_b, id_frag, id_cand)
                self.extract_current_sub_likelihood()
            id_c_prev_cand = id_ctg_b
            self.all_scores[
                i * self.n_tmp_struct : (i + 1) * self.n_tmp_struct
            ] = self.eval_all_sub_likelihood()
            flip_eject = 0
        # print "done!"

        scores_ok = np.copy(self.all_scores)
        scores_ok[scores_ok == 0] = -np.inf
        max_score = scores_ok.max()
        thresh_overflow = 30
        # filtered_score[filtered_score < max_score - thresh] = 0
        filtered_score = scores_ok - (max_score - thresh_overflow)
        filtered_score[filtered_score < 0] = 0

        global_id = np.argmax(filtered_score)
        # print "global id = ", global_id
        id_f_sampled = self.candidates[int(global_id / self.n_tmp_struct)]
        op_sampled = global_id % self.n_tmp_struct

        # print "id frag sampled = ", id_f_sampled
        # print "operation sampled = ", self.modification_str[op_sampled]
        # print 'id operation =', op_sampled

        self.test_copy_struct(id_frag, id_f_sampled, op_sampled, max_id)
        self.modify_gl_cuda_buffer(id_frag, self.gl_window.dt)
        o = self.all_scores[global_id]
        self.o = o
        dist = self.dist_inter_genome(self.gpu_vect_frags)
        self.likelihood_t = o
        return (
            o,
            dist,
            op_sampled,
            id_f_sampled,
            self.mean_length_contigs,
            self.n_contigs,
        )

[docs]    def step_sampler_debug(self, id_frag, n_neighbours):

        # n_neighbours = 5000

        self.candidates = list(range(0, n_neighbours + 1))
        self.candidates.pop(id_frag)
        n = len(self.candidates)

        self.fill_dist_single()
        # curr_likelihood_on_nz, curr_likelihood_on_z = self.eval_likelihood()
        self.eval_likelihood()
        self.gpu_vect_frags.copy_from_gpu()
        id_c_prev_cand = -1
        id_ctg_a = self.gpu_vect_frags.id_c[id_frag]
        # print "id_contig a = ", id_ctg_a
        self.all_scores = np.zeros((self.n_tmp_struct * n), dtype=np.float64)
        max_id = self.modify_gl_cuda_buffer(id_frag, self.gl_window.dt)
        flip_eject = 1
        for (id_cand, i) in zip(self.candidates, list(range(0, n))):
            self.gl_window.remote_update()
            self.extract_uniq_mutations(id_frag, id_cand, flip_eject)
            self.perform_mutations(id_frag, id_cand, max_id, 1 == 0)
            id_ctg_b = self.gpu_vect_frags.id_c[id_cand]
            if id_ctg_b != id_c_prev_cand:
                # print "id_contig b = ", id_ctg_b
                # print "slicing sparse matrix!"
                self.slice_sparse_mat(id_ctg_a, id_ctg_b, id_frag, id_cand)
                self.extract_current_sub_likelihood()
            id_c_prev_cand = id_ctg_b
            self.all_scores[
                i * self.n_tmp_struct : (i + 1) * self.n_tmp_struct
            ] = self.eval_all_sub_likelihood()
            flip_eject = 0

[docs]    def extract_uniq_mutations(self, id_fi, id_fj, flip_eject):
        start = cuda.Event()
        end = cuda.Event()
        start.record()
        self.gpu_list_uniq_mutations.fill(0)
        self.ctx.synchronize()

        self.kern_uniq_mutations(
            self.gpu_vect_frags.get_ptr(),
            np.int32(id_fi),
            np.int32(id_fj),
            self.gpu_list_uniq_mutations,
            self.gpu_list_valid_insert,
            self.gpu_n_uniq,
            np.int32(flip_eject),
            block=(self.n_threads_mutations, 1, 1),
            grid=(1, 1),
        )
        end.record()
        end.synchronize()
        # print "CUDA clock execution timing(define uniq mutations): ", secs

[docs]    def loadProgram(self, filename):
        # read in the Cuda source file as a string
        f = open(filename, "r")
        raw_fstr = "".join(f.readlines())
        f.close()
        if self.active_insert_blocks:
            logger.info(
                "size array in shared memory = {}".format(
                    str(self.n_tmp_struct * self.size_block_4_sub)
                )
            )
            fstr = (
                raw_fstr.replace(
                    "N_STRUCT_BY_BLOCK_SIZE",
                    str(self.n_tmp_struct * self.size_block_4_sub),
                )
                .replace("SIZE_BLOCK_4_SUB_3", str(self.size_block_4_sub * 3))
                .replace("N_TMP_STRUCT", str(self.n_tmp_struct))
                .replace("SIZE_BLOCK_4_SUB", str(self.size_block_4_sub))
                .replace("N_TO_CUT", str(self.n_insert_blocks))
            )

        else:
            fstr = raw_fstr

        # create the program
        self.module = pycuda.compiler.SourceModule(
            fstr,
            no_extern_c=True,
            options=[
                # options=["--cubin", "-dc=true", "-lcudadevrt", "-m64",
                #          "--keep",
                # options=["--cubin","-lcudadevrt", "-m64",
            ],
            # options=["-lcudadevrt", "-m64"],
        )

        # self.sub_evaluate_likelihood_sparse =
        # self.module.get_function('sub_evaluate_likelihood_sparse')

        self.kern_fill_vect_dist_all = self.module.get_function(
            "fill_vect_dist"
        )
        self.kern_fill_vect_dist_single = self.module.get_function(
            "uni_fill_vect_dist"
        )
        self.init_rng = self.module.get_function("init_rng")
        self.kern_evaluate_likelihood_single = self.module.get_function(
            "evaluate_likelihood_sparse"
        )
        self.kern_sub_likelihood = self.module.get_function(
            "eval_sub_likelihood"
        )
        # self.kern_select_them = self.module.get_function('select_them')
        self.kern_slice_sp_mat = self.module.get_function("slice_sp_mat")
        self.kern_prepare_sparse_call = self.module.get_function(
            "prepare_sparse_call"
        )
        self.kern_eval_likelihood_zeros = self.module.get_function(
            "eval_likelihood_on_zero"
        )
        self.kern_eval_all_likelihood_zeros_1st = self.module.get_function(
            "eval_all_likelihood_on_zero_1st"
        )
        self.kern_eval_all_likelihood_zeros_2nd = self.module.get_function(
            "eval_all_likelihood_on_zero_2nd"
        )
        self.kern_extract_sub_likelihood = self.module.get_function(
            "extract_sub_likelihood"
        )
        self.kern_uniq_mutations = self.module.get_function(
            "extract_uniq_mutations"
        )
        self.kern_eval_all_scores = self.module.get_function("eval_all_scores")
        self.kern_select_uniq_id_c = self.module.get_function(
            "select_uniq_id_c"
        )
        self.kern_make_old_2_new_id_c = self.module.get_function(
            "make_old_2_new_id_c"
        )
        self.kern_count_vals = self.module.get_function("count_num")
        self.kern_update_gpu_vect = self.module.get_function(
            "update_gpu_vect_frags"
        )
        self.kern_explode_genome = self.module.get_function("explode_genome")
        if self.active_insert_blocks:
            self.kern_get_bounds = self.module.get_function("get_bounds")
            self.kern_extract_block = self.module.get_function("extract_block")
            self.kern_insert_block = self.module.get_function("insert_block")
        self.kern_frags_2_gl_pxl = self.module.get_function("gpu_struct_2_pxl")
        self.kern_update_matrix = self.module.get_function("update_matrix")
        self.kern_update_gl_buffer = self.module.get_function(
            "update_gl_buffer"
        )
        self.kern_prepare_sparse_call_4_gl = self.module.get_function(
            "prepare_sparse_call_4_gl"
        )

        self.set_null = self.module.get_function("set_null")
        self.copy_gpu_array = self.module.get_function("copy_gpu_array")
        self.gl_update_pos = self.module.get_function("gl_update_pos")
        self.gpu_transloc = []
        self.pop_out = self.module.get_function("pop_out_frag")
        self.flip_frag = self.module.get_function("flip_frag")
        self.pop_in_1 = self.module.get_function(
            "pop_in_frag_1"
        )  # split insert @ left
        self.pop_in_2 = self.module.get_function(
            "pop_in_frag_2"
        )  # split insert @ right
        self.pop_in_3 = self.module.get_function(
            "pop_in_frag_3"
        )  # insert @ left
        # self.pop_in_4 = self.module.get_function('pop_in_frag_4') # insert @
        # right
        self.split = self.module.get_function("split_contig")
        self.paste = self.module.get_function("paste_contigs")
        self.simple_copy = self.module.get_function("simple_copy")
        self.copy_vect = self.module.get_function("copy_struct")
        self.swap_activity = self.module.get_function("swap_activity_frag")
        self.modification_str = [
            "eject frag",
            "flip frag",
            "pop out split insert @ left or 1",
            "pop out split insert @ left or -1",
            "pop out split insert @ right or 1",
            "pop out split insert @ right or -1",
            "pop out insert @ right or 1",
            "pop out insert @ right or -1",
            # 'swap activity',
            # 'pop out insert @ left or 1', 'pop out insert @ left or -1',
            "transloc_1",
            "transloc_2",
            "transloc_3",
            "transloc_4",
            "local_scramble d1",
            "local_scramble d2",
            "local_scramble d3",
            "local_scramble d4",
        ]

[docs]    def update_neighbourhood(self,):
        tmp_sorted = self.hic_matrix_sub_sampled.argsort(axis=1)
        sorted_neighbours = []
        for i in self.list_frag_to_sample:
            all_idx = tmp_sorted[i, :]
            pos = np.nonzero(all_idx == i)[0][0]
            line = list(all_idx)
            line.pop(pos)
            logger.info("filtering neighbourhood of : {}".format(i))
            for j in self.list_to_pop_out:
                line = np.array(line)
                pos = np.nonzero(line == j)[0][0]
                line = list(line)
                line.pop(pos)

            sorted_neighbours.append(line)
        self.sorted_neighbours = np.array(sorted_neighbours)

[docs]    def pop_out_pop_in(self, id_f_pop, id_f_ins, mode, max_id):
        size_block = 1024
        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_new_frags // size_block + 1), 1)
        # max_id = np.int32(ga.max(self.gpu_id_contigs).get())
        # print 'max_id contig before pop out= ', max_id
        start = cuda.Event()
        end = cuda.Event()
        start.record()

        self.pop_out(
            self.pop_gpu_vect_frags.get_ptr(),
            self.gpu_vect_frags.get_ptr(),
            self.pop_gpu_id_contigs,
            np.int32(id_f_pop),
            np.int32(max_id),
            self.n_new_frags,
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()

        max_id2 = np.int32(ga.max(self.pop_gpu_id_contigs).get())
        # print 'max_id contig after pop out= ', max_id2
        start.record()
        # modif_vector = self.collector_gpu_vect_frags[mode]
        or_watson = np.int32(1)
        or_crick = np.int32(-1)
        # print "max_id from pop = ", max_id
        if mode == 0:
            self.simple_copy(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.pop_gpu_vect_frags.get_ptr(),
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
        elif mode == 1:
            self.flip_frag(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.gpu_vect_frags.get_ptr(),
                np.int32(id_f_pop),
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
        elif mode == 2:
            self.pop_in_1(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.pop_gpu_vect_frags.get_ptr(),
                np.int32(id_f_pop),
                np.int32(id_f_ins),
                max_id2,
                or_watson,
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
        elif mode == 3:
            self.pop_in_1(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.pop_gpu_vect_frags.get_ptr(),
                np.int32(id_f_pop),
                np.int32(id_f_ins),
                max_id2,
                or_crick,
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
        elif mode == 4:
            self.pop_in_2(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.pop_gpu_vect_frags.get_ptr(),
                np.int32(id_f_pop),
                np.int32(id_f_ins),
                max_id2,
                or_watson,
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
        elif mode == 5:
            self.pop_in_2(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.pop_gpu_vect_frags.get_ptr(),
                np.int32(id_f_pop),
                np.int32(id_f_ins),
                max_id2,
                or_crick,
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
        elif mode == 6:
            self.pop_in_3(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.pop_gpu_vect_frags.get_ptr(),
                np.int32(id_f_pop),
                np.int32(id_f_ins),
                max_id2,
                or_watson,
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
        elif mode == 7:
            self.pop_in_3(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.pop_gpu_vect_frags.get_ptr(),
                np.int32(id_f_pop),
                np.int32(id_f_ins),
                max_id2,
                or_crick,
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
        # elif mode == 8:
        # self.pop_in_4(self.collector_gpu_vect_frags[mode].get_ptr(),
        # self.pop_gpu_vect_frags.get_ptr(), np.int32(id_f_pop),
        # np.int32(id_f_ins), max_id2, or_watson, self.n_frags, block=block_,
        # grid=grid_) elif mode == 9:
        # self.pop_in_4(self.collector_gpu_vect_frags[mode].get_ptr(),
        # self.pop_gpu_vect_frags.get_ptr(), np.int32(id_f_pop),
        # np.int32(id_f_ins), max_id2, or_crick, self.n_frags, block=block_,
        # grid=grid_)

        # elif mode == 8:
        # self.swap_activity(self.collector_gpu_vect_frags[mode].get_ptr(),
        # self.pop_gpu_vect_frags.get_ptr(), np.int32(id_f_pop), max_id2,
        # self.n_new_frags, block=block_, grid=grid_)

        end.record()
        end.synchronize()
        # print "CUDA clock execution timing( generate mutations): ", secs

[docs]    def transloc(self, id_fA, id_fB, max_id):
        size_block = 128
        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_new_frags // size_block + 1), 1)
        # max_id = np.int32(ga.max(self.gpu_id_contigs).get())
        start = cuda.Event()
        end = cuda.Event()
        mode = 0
        # id_start_transloc = 8
        id_start_transloc = 8
        for upstreamfA in range(0, 2):
            start.record()
            self.split(
                self.trans1_gpu_vect_frags.get_ptr(),
                self.gpu_vect_frags.get_ptr(),
                self.trans1_gpu_id_contigs,
                np.int32(id_fA),
                np.int32(upstreamfA),
                np.int32(max_id),
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
            end.record()
            end.synchronize()
            for upstreamfB in range(0, 2):
                max_id1 = np.int32(ga.max(self.trans1_gpu_id_contigs).get())
                # print 'max_id1 = ', max_id1
                start.record()
                self.split(
                    self.trans2_gpu_vect_frags.get_ptr(),
                    self.trans1_gpu_vect_frags.get_ptr(),
                    self.trans2_gpu_id_contigs,
                    np.int32(id_fB),
                    np.int32(upstreamfB),
                    max_id1,
                    self.n_new_frags,
                    block=block_,
                    grid=grid_,
                )
                end.record()
                end.synchronize()
                max_id2 = np.int32(ga.max(self.trans2_gpu_id_contigs).get())
                # print 'max_id2 = ', max_id2
                curr_vect_trans = self.collector_gpu_vect_frags[
                    id_start_transloc + mode
                ]
                start.record()
                # self.simple_copy(curr_vect_trans.get_ptr(),
                # self.trans2_gpu_vect_frags.get_ptr(), self.n_frags,
                # block=block_, grid=grid_)
                self.paste(
                    curr_vect_trans.get_ptr(),
                    self.trans2_gpu_vect_frags.get_ptr(),
                    np.int32(id_fA),
                    np.int32(id_fB),
                    max_id2,
                    self.n_new_frags,
                    block=block_,
                    grid=grid_,
                )
                end.record()
                end.synchronize()
                mode += 1

[docs]    def insert_blocks(self, id_fA, id_fB, max_id):
        size_block = self.size_block_4_sub
        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_new_frags // size_block + 1), 1)
        # max_id = np.int32(ga.max(self.gpu_id_contigs).get())
        start = cuda.Event()
        end = cuda.Event()

        id_start_insert = 12

        start.record()
        self.gpu_list_valid_insert.fill(-1)
        self.gpu_list_f_upstream.fill(-1)
        self.gpu_list_f_downstream.fill(-1)
        self.ctx.synchronize()
        self.kern_get_bounds(
            self.gpu_vect_frags.get_ptr(),
            np.int32(id_fA),
            np.int32(id_fB),
            self.gpu_list_valid_insert,
            self.gpu_list_bounds,
            self.gpu_list_f_upstream,
            self.gpu_list_f_downstream,
            np.int32(self.n_insert_blocks),
            self.n_new_frags,
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()
        id = 0
        for i in range(0, self.n_insert_blocks):
            for j in [1, 0]:
                if j == 1:
                    list_bounds = self.gpu_list_f_upstream
                else:
                    list_bounds = self.gpu_list_f_downstream

                start.record()
                self.kern_extract_block(
                    self.trans1_gpu_vect_frags.get_ptr(),
                    self.gpu_vect_frags.get_ptr(),
                    self.trans1_gpu_id_contigs,
                    np.int32(id_fA),
                    list_bounds,
                    np.int32(i),  # id f in list_frag upstream
                    np.int32(j),  # upstream
                    np.int32(max_id),
                    self.n_new_frags,
                    block=block_,
                    grid=grid_,
                )
                end.record()
                end.synchronize()

                start.record()
                self.kern_insert_block(
                    self.collector_gpu_vect_frags[
                        id_start_insert + id
                    ].get_ptr(),
                    self.trans1_gpu_vect_frags.get_ptr(),
                    self.gpu_vect_frags.get_ptr(),
                    np.int32(id_fA),
                    np.int32(id_fB),
                    list_bounds,
                    self.gpu_list_valid_insert,
                    np.int32(id),
                    np.int32(i),
                    np.int32(j),
                    self.n_new_frags,
                    block=block_,
                    grid=grid_,
                )
                end.record()
                end.synchronize()
                id += 1

[docs]    def perform_mutations(self, id_fA, id_fB, max_id, is_first):
        for mode in range(0, 8):
            self.pop_out_pop_in(id_fA, id_fB, mode, max_id)
        self.transloc(id_fA, id_fB, max_id)
        if self.active_insert_blocks:
            self.insert_blocks(id_fA, id_fB, max_id)

        # for mode in xrange(14, self.n_tmp_struct):
        #     self.local_flip(id_fA, mode, max_id)
        # self.all_pop_out_pop_in(id_fA, id_fB, max_id, is_first)
        # tic_fillB = time.time()
        # self.all_transloc(id_fA, id_fB, max_id, is_first)
        # print "all_pop out time execution  = ", time.time() - tic_fillB

[docs]    def bomb_the_genome(self,):

        size_block = 256
        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_new_frags // size_block + 1), 1)
        a = np.arange(0, self.n_new_frags, dtype=np.int32)
        np.random.shuffle(a)
        gpu_shuffle_order = ga.to_gpu(ary=a)
        self.ctx.synchronize()
        start = cuda.Event()
        end = cuda.Event()
        start.record()
        self.kern_explode_genome(
            self.gpu_vect_frags.get_ptr(),
            gpu_shuffle_order,
            self.n_new_frags,
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()
        self.modify_gl_cuda_buffer(0, self.gl_window.dt)

[docs]    def local_flip(self, id_fA, mode, max_id):
        # mode = 12
        size_block = 256
        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_new_frags // size_block + 1), 1)
        start = cuda.Event()
        end = cuda.Event()
        local_delta = mode - 11
        vect_frags = self.gpu_vect_frags
        vect_frags.copy_from_gpu()

        pos_fA = vect_frags.pos[id_fA]
        id_contig_A = vect_frags.id_c[id_fA]
        len_contig_A = vect_frags.l_cont[id_fA]

        id_f_in_contig_A = np.nonzero(vect_frags.id_c == id_contig_A)[0]
        neighbours = id_f_in_contig_A
        pos_neighbours = vect_frags.pos[neighbours]

        arg_sort_id = np.argsort(pos_neighbours)

        ordered_neighbours = neighbours[arg_sort_id]
        orientations_neighbours = vect_frags.ori[ordered_neighbours]
        id_up = max(pos_fA - local_delta, 0)
        id_down = min(pos_fA + local_delta, len_contig_A - 1)
        # print "id_fA = ", id_fA
        # print "pos_fA = ", pos_fA
        # print "pos up = ", id_up
        # print "pos down = ", id_down

        start.record()
        self.simple_copy(
            self.scrambled_gpu_vect_frags.get_ptr(),
            self.gpu_vect_frags.get_ptr(),
            self.n_new_frags,
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()
        # print "ordered neighbours = ", ordered_neighbours
        for i in range(id_up, id_down + 1):
            id_fB = ordered_neighbours[i]
            if id_fB != id_fA:
                start.record()
                self.pop_out(
                    self.pop_gpu_vect_frags.get_ptr(),
                    self.scrambled_gpu_vect_frags.get_ptr(),
                    self.pop_gpu_id_contigs,
                    np.int32(id_fB),
                    max_id,
                    self.n_new_frags,
                    block=block_,
                    grid=grid_,
                )
                end.record()
                end.synchronize()
                start.record()
                self.simple_copy(
                    self.scrambled_gpu_vect_frags.get_ptr(),
                    self.pop_gpu_vect_frags.get_ptr(),
                    self.n_new_frags,
                    block=block_,
                    grid=grid_,
                )
                end.record()
                end.synchronize()
                self.scrambled_gpu_vect_frags.copy_from_gpu()
                max_id = self.scrambled_gpu_vect_frags.id_c.max()

        for j in range(id_down, pos_fA, -1):
            id_fB = ordered_neighbours[j]
            ori_fB = orientations_neighbours[j] * -1
            # print "id_fB = ", id_fB
            start.record()
            self.pop_in_4(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.scrambled_gpu_vect_frags.get_ptr(),
                np.int32(id_fB),
                np.int32(id_fA),
                max_id,
                np.int32(ori_fB),
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
            end.record()
            end.synchronize()
            start.record()
            self.simple_copy(
                self.scrambled_gpu_vect_frags.get_ptr(),
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
            end.record()
            end.synchronize()
            self.scrambled_gpu_vect_frags.copy_from_gpu()
            max_id = self.scrambled_gpu_vect_frags.id_c.max()
        # print "insert left ok"
        for j in range(id_up, pos_fA):
            id_fB = ordered_neighbours[j]
            ori_fB = orientations_neighbours[j] * -1
            # print "id_fB = ", id_fB
            start.record()
            self.pop_in_3(
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.scrambled_gpu_vect_frags.get_ptr(),
                np.int32(id_fB),
                np.int32(id_fA),
                max_id,
                np.int32(ori_fB),
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
            end.record()
            end.synchronize()
            start.record()
            self.simple_copy(
                self.scrambled_gpu_vect_frags.get_ptr(),
                self.collector_gpu_vect_frags[mode].get_ptr(),
                self.n_new_frags,
                block=block_,
                grid=grid_,
            )
            end.record()
            end.synchronize()
            self.scrambled_gpu_vect_frags.copy_from_gpu()
            max_id = self.scrambled_gpu_vect_frags.id_c.max()

        start.record()
        self.flip_frag(
            self.collector_gpu_vect_frags[mode].get_ptr(),
            self.scrambled_gpu_vect_frags.get_ptr(),
            np.int32(id_fA),
            self.n_new_frags,
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()

[docs]    def test_copy_struct(self, id_fA, id_f_sampled, mode, max_id):
        self.gpu_vect_frags.copy_from_gpu()
        # c = self.gpu_vect_frags
        # print 'id fA = ', id_fA
        # print 'id fB = ', id_f_sampled
        # print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
        # print 'id_cont id_fA = ', c.id_c[id_fA]
        # print 'id_cont id_fB = ', c.id_c[id_f_sampled]
        # print 'id_d id_fA = ', c.id_d[id_fA]
        # print 'id_d id_fB = ', c.id_d[id_f_sampled]
        # print 'rep id_fA = ', c.rep[id_fA]
        # print 'rep id_fB = ', c.rep[id_f_sampled]
        # print 'pos id_fA = ', c.pos[id_fA]
        # print 'pos id_fB = ', c.pos[id_f_sampled]
        # print 'l_cont id_fA = ', c.l_cont[id_fA]
        # print 'l_cont id_fB = ', c.l_cont[id_f_sampled]
        # print 'start_bp id_fA = ', c.start_bp[id_fA]
        # print 'start_bp id_fB = ', c.start_bp[id_f_sampled]
        # print 'l_cont_bp id_fA = ', c.l_cont_bp[id_fA]
        # print 'l_cont_bp id_fB = ', c.l_cont_bp[id_f_sampled]
        # print 'is circle cont id_fA =', c.circ[id_fA]
        # print 'is circle cont id_fB =', c.circ[id_f_sampled]
        # print 'prev id_fA = ', c.prev[id_fA]
        # print 'next id_fA = ', c.next[id_fA]
        # print 'prev id_fB = ', c.prev[id_f_sampled]
        # print 'next id_fB = ', c.next[id_f_sampled]
        # print '########################'
        if mode < 8:
            self.pop_out_pop_in(id_fA, id_f_sampled, mode, max_id)
        elif mode < 12:
            self.transloc(id_fA, id_f_sampled, max_id)
        elif mode >= 12 and self.active_insert_blocks:
            self.insert_blocks(id_fA, id_f_sampled, max_id)
        # else:
        #     self.local_scramble(id_fA, max_id)
        size_block = 1024
        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_new_frags // size_block + 1), 1)
        start = cuda.Event()
        end = cuda.Event()
        start.record()
        # print 'mode = ', mode
        sampled_vect_frags = self.collector_gpu_vect_frags[mode]
        # sampled_vect_frags.copy_from_gpu()
        # plt.plot(sampled_vect_frags.id_c)
        # plt.show()

        self.copy_vect(
            self.gpu_vect_frags.get_ptr(),
            sampled_vect_frags.get_ptr(),
            self.gpu_id_contigs,
            self.n_new_frags,
            block=block_,
            grid=grid_,
            shared=0,
        )
        end.record()
        end.synchronize()
        # secs = start.time_till(end) * 1e-3
        # self.gpu_vect_frags.copy_from_gpu()
        # c = self.gpu_vect_frags
        # print '########################'
        # print 'id_cont id_fA = ', c.id_c[id_fA]
        # print 'id_cont id_fB = ', c.id_c[id_f_sampled]
        # print 'id_d id_fA = ', c.id_d[id_fA]
        # print 'id_d id_fB = ', c.id_d[id_f_sampled]
        # print 'rep id_fA = ', c.rep[id_fA]
        # print 'rep id_fB = ', c.rep[id_f_sampled]
        # print 'pos id_fA = ', c.pos[id_fA]
        # print 'pos id_fB = ', c.pos[id_f_sampled]
        # print 'l_cont id_fA = ', c.l_cont[id_fA]
        # print 'l_cont id_fB = ', c.l_cont[id_f_sampled]
        # print 'start_bp id_fA = ', c.start_bp[id_fA]
        # print 'start_bp id_fB = ', c.start_bp[id_f_sampled]
        # print 'l_cont_bp id_fA = ', c.l_cont_bp[id_fA]
        # print 'l_cont_bp id_fB = ', c.l_cont_bp[id_f_sampled]
        # print 'is circle cont id_fA =', c.circ[id_fA]
        # print 'is circle cont id_fB =', c.circ[id_f_sampled]
        # print 'prev id_fA = ', c.prev[id_fA]
        # print 'next id_fA = ', c.next[id_fA]
        # print 'prev id_fB = ', c.prev[id_f_sampled]
        # print 'next id_fB = ', c.next[id_f_sampled]
        # print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'

[docs]    def setup_rippe_parameters_4_simu(
        self, kuhn, lm, slope, d, val_inter, d_max
    ):

        kuhn = np.float32(kuhn)
        lm = np.float32(lm)
        c1 = np.float32(
            (0.53 * np.power(lm / kuhn, slope)) * np.power(kuhn, -3)
        )
        slope = np.float32(slope)
        d = np.float32(d)
        d_max = np.float32(d_max)
        val_inter = np.float32(val_inter)
        parameters = [kuhn, lm, slope, d]

        def rippe(param, dist):

            return (
                0.53
                * (param[0] ** -3.)
                * np.power((param[1] * dist / param[0]), (param[2]))
                * np.exp(
                    (param[3] - 2)
                    / ((np.power((param[1] * dist / param[0]), 2) + param[3]))
                )
            )

        rippe_inter = rippe(parameters, d_max)
        fact = val_inter / rippe_inter
        p = np.array(
            [(kuhn, lm, c1, slope, d, d_max, fact, val_inter)],
            dtype=self.param_simu_T,
        )
        return p

[docs]    def setup_rippe_parameters(self, param, d_max):

        kuhn, lm, slope, d, fact = param
        fact = np.float32(np.abs(fact))
        kuhn = np.float32(np.abs(kuhn))
        lm = np.float32(np.abs(lm))
        c1 = np.float32(
            (0.53 * np.power(lm / kuhn, slope)) * np.power(kuhn, -3)
        )
        slope = np.float32(slope)
        d = np.float32(d)
        fact = np.float32(fact)
        d_max = np.float32(d_max)
        p = np.array(
            [(kuhn, lm, c1, slope, d, d_max, fact, self.mean_value_trans)],
            dtype=self.param_simu_rippe,
        )
        return p

[docs]    def setup_model_parameters(self, param, d_max):

        d0, alpha_0, alpha_1, fact = param

        d0 = np.float32(d0)
        d_max = np.float32(d_max)
        alpha_0 = np.float32(alpha_0)
        alpha_1 = np.float32(alpha_1)
        fact = np.float32(fact)
        p = np.array(
            [(d0, d_max, alpha_0, alpha_1, fact, self.mean_value_trans)],
            dtype=self.param_simu_exp,
        )

        return p

[docs]    def estimate_parameters_rippe(
        self, max_dist_kb, size_bin_kb, display_graph
    ):
        """
        estimation by least square optimization of Rippe parameters on the
        experimental data
        :param max_dist_kb:
        :param size_bin_kb:
        """
        logger.info("estimation of the parameters of the model")
        self.bins = np.arange(
            size_bin_kb, max_dist_kb + size_bin_kb, size_bin_kb
        )

        self.mean_contacts = np.zeros_like(self.bins, dtype=np.float32)
        self.dict_collect = dict()
        self.gpu_vect_frags.copy_from_gpu()
        epsi = self.mean_value_trans
        for k in self.bins:
            self.dict_collect[k] = []
        for i in range(0, self.n_frags // 10):
            # print "frag i = ", i
            start = self.sparse_matrix.indptr[i]
            end = self.sparse_matrix.indptr[i + 1]
            id_j = self.sparse_matrix.indices[start:end]
            data = self.sparse_matrix.data[start:end]

            info_i = self.np_sub_frags_2_frags[i]
            init_id_fi = int(info_i[0])
            # pos_i = self.S_o_A_frags["pos"][init_id_fi]
            id_c_i = self.S_o_A_frags["id_c"][init_id_fi]
            s_i = (
                self.S_o_A_frags["start_bp"][init_id_fi] / 1000.0
                + self.np_sub_frags_2_frags[i][1]
            )
            len_kb_c_i = self.S_o_A_frags["l_cont_bp"][init_id_fi] / 1000
            # local_bins = np.arange(size_bin_kb, min(len_kb_c_i, max_dist_kb)
            # + size_bin_kb, size_bin_kb) local_storage =
            # np.zeros_like(local_bins, dtype=np.int32)
            #
            # for fj, dj in zip(id_j, data):
            #     info_j = self.np_sub_frags_2_frags[fj]
            #     init_id_fj = info_j[0]
            #     id_c_j = self.S_o_A_frags['id_c'][init_id_fj]
            #     if id_c_i == id_c_j:
            #         pos_j = self.S_o_A_frags['pos'][init_id_fj]
            #         s_j = self.S_o_A_frags['start_bp'][init_id_fj]/1000.0 +
            #         self.np_sub_frags_2_frags[fj][1]
            #         d = np.abs(s_i - s_j)
            #         if d < max_dist_kb:
            #             id_bin = d / size_bin_kb
            #             local_storage[id_bin] += dj
            #             # self.dict_collect[self.bins[id_bin]].append(dj)
            # # we have to add also the zeros
            # for bin, val in zip(local_bins, local_storage):
            #     # print "bin = ", bin
            #     self.dict_collect[bin].append(val)

            if size_bin_kb < len_kb_c_i:
                # local_bins = np.arange(size_bin_kb, min(len_kb_c_i,
                # max_dist_kb) + size_bin_kb, size_bin_kb)
                local_bins = np.arange(
                    size_bin_kb, max_dist_kb + size_bin_kb, size_bin_kb
                )
                local_storage = np.zeros_like(local_bins, dtype=np.int32)

                for fj, dj in zip(id_j, data):
                    info_j = self.np_sub_frags_2_frags[fj]
                    init_id_fj = int(info_j[0])
                    id_c_j = self.S_o_A_frags["id_c"][init_id_fj]
                    if id_c_i == id_c_j:
                        # pos_j = self.S_o_A_frags["pos"][init_id_fj]
                        s_j = (
                            self.S_o_A_frags["start_bp"][init_id_fj] / 1000.0
                            + self.np_sub_frags_2_frags[fj][1]
                        )
                        d = np.abs(s_i - s_j)
                        if d < max_dist_kb:
                            id_bin = int(d / size_bin_kb)
                            local_storage[id_bin] += dj
                            # self.dict_collect[self.bins[id_bin]].append(dj)
                # we have to add also the zeros
                for bin, val in zip(local_bins, local_storage):
                    # print "bin = ", bin
                    self.dict_collect[bin].append(val)

        for id_bin in range(0, len(self.bins)):
            k = self.bins[id_bin]
            self.mean_contacts[id_bin] = np.mean(self.dict_collect[k])

        for id_bin in range(0, len(self.bins)):
            k = self.bins[id_bin]
            tmp = np.mean(self.dict_collect[k])
            if np.isnan(tmp) or tmp == 0:
                # if np.isnan(tmp):
                # if np.isnan(tmp):
                #     print "removing nan"
                self.mean_contacts[id_bin] = np.nan
            else:
                self.mean_contacts[id_bin] = tmp + epsi

        self.mean_contacts_upd = []
        self.bins_upd = []

        for count, ele in zip(self.mean_contacts, self.bins):
            if not np.isnan(count):
                self.bins_upd.append(ele)
                self.mean_contacts_upd.append(count)

        self.bins_upd = np.array(self.bins_upd)
        # self.mean_contacts_upd.sort()
        # self.mean_contacts_upd.reverse()
        self.mean_contacts_upd = np.array(self.mean_contacts_upd)

        # self.mean_contacts_upd =
        # ndi.filters.gaussian_filter1d(self.mean_contacts_upd,
        # sigma=len(self.bins_upd) / 5.)

        # print "size mean contacts vector = ", self.mean_contacts_upd.shape
        # print "mean contacts vector = ", self.mean_contacts_upd

        p, self.y_estim = opti.estimate_param_rippe(
            self.mean_contacts_upd, self.bins_upd
        )
        ##########################################
        logger.info("p from estimate parameters  = {}".format(p))
        # p = list(p[0])
        # p[3] = 2
        # p = tuple(p)
        ##########################################
        fit_param = p
        logger.info("mean value trans = {}".format(self.mean_value_trans))
        ##########################################
        logger.info("BEWARE!!! : I will lower mean value trans  !!!")
        self.mean_value_trans = self.mean_value_trans / 10.0
        ##########################################

        estim_max_dist = opti.estimate_max_dist_intra(
            fit_param, self.mean_value_trans
        )
        logger.info("estimate max dist cis trans = {}".format(estim_max_dist))
        self.param_simu = self.setup_rippe_parameters(
            fit_param, estim_max_dist
        )
        self.param_simu_test = self.param_simu
        self.gpu_param_simu = cuda.mem_alloc(self.param_simu.nbytes)
        self.gpu_param_simu_test = cuda.mem_alloc(self.param_simu.nbytes)

        cuda.memcpy_htod(self.gpu_param_simu, self.param_simu)
        cuda.memcpy_htod(self.gpu_param_simu_test, self.param_simu_test)
        display_graph = False
        if display_graph:
            plt.figure()
            plt.loglog(self.bins_upd, self.mean_contacts_upd, "-*b")
            plt.loglog(self.bins_upd, self.y_estim, "-*r")
            plt.xlabel("genomic distance (kb)")
            plt.ylabel("frequency of contact")
            plt.title(
                r"$\mathrm{Frequency\ of\ contact\ versus\ genomic\ distance"
                " \(data\ tricho test):}\ slope=%.3f,\ max\ cis\ distance(kb)"
                "=%.3f\ d=%.3f\ scale\ factor=%.3f\ $"
                % (
                    self.param_simu["slope"],
                    estim_max_dist,
                    self.param_simu["d"],
                    self.param_simu["fact"],
                )
            )
            plt.legend(["obs", "fit"])
            plt.savefig()
            plt.close()

        self.eval_likelihood_init()

[docs]    def estimate_parameters(self, max_dist_kb, size_bin_kb, display_graph):
        """
        estimation by least square optimization of Rippe parameters on the
        experimental data
        :param max_dist_kb:
        :param size_bin_kb:
        """
        logger.info("estimation of the parameters of the model")
        self.bins = np.arange(
            size_bin_kb, max_dist_kb + size_bin_kb, size_bin_kb
        )

        self.mean_contacts = np.zeros_like(self.bins, dtype=np.float32)
        self.dict_collect = dict()
        self.gpu_vect_frags.copy_from_gpu()
        epsi = self.mean_value_trans
        for k in self.bins:
            self.dict_collect[k] = []
        for i in range(0, 2000):
            # print "frag i = ", i
            start = self.sparse_matrix.indptr[i]
            end = self.sparse_matrix.indptr[i + 1]
            id_j = self.sparse_matrix.indices[start:end]
            data = self.sparse_matrix.data[start:end]

            info_i = self.np_sub_frags_2_frags[i]
            init_id_fi = info_i[0]
            # pos_i = self.S_o_A_frags["pos"][init_id_fi]
            id_c_i = self.S_o_A_frags["id_c"][init_id_fi]
            s_i = (
                self.S_o_A_frags["start_bp"][init_id_fi] / 1000.0
                + self.np_sub_frags_2_frags[i][1]
            )
            len_kb_c_i = self.S_o_A_frags["l_cont_bp"][init_id_fi] / 1000
            local_bins = np.arange(
                size_bin_kb,
                min(len_kb_c_i, max_dist_kb) + size_bin_kb,
                size_bin_kb,
            )
            local_storage = np.zeros_like(local_bins, dtype=np.int32)

            for fj, dj in zip(id_j, data):
                info_j = self.np_sub_frags_2_frags[fj]
                init_id_fj = info_j[0]
                id_c_j = self.S_o_A_frags["id_c"][init_id_fj]
                if id_c_i == id_c_j:
                    # pos_j = self.S_o_A_frags["pos"][init_id_fj]
                    s_j = (
                        self.S_o_A_frags["start_bp"][init_id_fj] / 1000.0
                        + self.np_sub_frags_2_frags[fj][1]
                    )
                    d = np.abs(s_i - s_j)
                    if d < max_dist_kb:
                        id_bin = d / size_bin_kb
                        local_storage[id_bin] += dj
                        # self.dict_collect[self.bins[id_bin]].append(dj)
            # we have to add also the zeros
            for my_bin, val in zip(local_bins, local_storage):
                # print "bin = ", bin
                self.dict_collect[my_bin].append(val)

        for id_bin in range(0, len(self.bins)):
            k = self.bins[id_bin]
            self.mean_contacts[id_bin] = np.mean(self.dict_collect[k])

        for id_bin in range(0, len(self.bins)):
            k = self.bins[id_bin]
            tmp = np.mean(self.dict_collect[k])
            if np.isnan(tmp) or tmp == 0:
                # if np.isnan(tmp):
                # if np.isnan(tmp):
                #     print "removing nan"
                self.mean_contacts[id_bin] = np.nan
            else:
                self.mean_contacts[id_bin] = tmp + epsi

        self.mean_contacts_upd = []
        self.bins_upd = []

        for count, ele in zip(self.mean_contacts, self.bins):
            if not np.isnan(count):
                self.bins_upd.append(ele)
                self.mean_contacts_upd.append(count)

        self.bins_upd = np.array(self.bins_upd)
        self.mean_contacts_upd = np.array(self.mean_contacts_upd)

        # self.mean_contacts_upd =
        # ndi.filters.gaussian_filter1d(self.mean_contacts_upd,
        # sigma=len(self.bins_upd) / 5.)

        p, self.y_estim = nuis.estimate_param_hic(
            self.mean_contacts_upd, self.bins_upd
        )
        ##########################################
        fit_param = p.x
        ##########################################
        logger.info("mean value trans = {}".format(self.mean_value_trans))
        ##########################################
        estim_max_dist = nuis.estimate_max_dist_intra(
            fit_param, self.mean_value_trans
        )
        logger.info("max distance cis/trans = {}".format(estim_max_dist))
        ##########################################
        self.param_simu = self.setup_model_parameters(
            fit_param, estim_max_dist
        )
        self.gpu_param_simu = cuda.mem_alloc(self.param_simu.nbytes)
        self.gpu_param_simu_test = cuda.mem_alloc(self.param_simu.nbytes)

        cuda.memcpy_htod(self.gpu_param_simu, self.param_simu)

        if display_graph:
            plt.loglog(self.bins_upd, self.mean_contacts_upd, "-*b")
            plt.loglog(self.bins_upd, self.y_estim, "-*r")
            plt.xlabel("genomic distance (kb)")
            plt.ylabel("frequency of contact")
            plt.legend(["obs", "fit"])
            plt.show()

[docs]    def insert_repeats(self, id_f_ins):
        for id in range(0, self.n_new_frags):
            self.gpu_vect_frags.copy_from_gpu()
            max_id = self.gpu_vect_frags.id_c.max()
            if self.gpu_vect_frags.rep[id] == 1:
                logger.info("id repeats = {}".format(id))
                mode = 7
                self.test_copy_struct(id, id_f_ins, mode, max_id)

[docs]    def modify_genome(self, n):
        list_breaks = np.random.choice(self.n_new_frags, n * 2, replace=False)
        list_modes = np.random.choice(self.n_tmp_struct, n, replace=True)
        for i in range(0, n):
            self.gpu_vect_frags.copy_from_gpu()
            max_id = self.gpu_vect_frags.id_c.max()
            self.test_copy_struct(
                list_breaks[2 * i],
                list_breaks[2 * i + 1],
                list_modes[i],
                max_id,
            )
            self.gpu_vect_frags.copy_from_gpu()
            c = self.gpu_vect_frags
            if (
                np.any(c.pos < 0)
                or np.any(c.l_cont < 0)
                or np.any(c.l_cont_bp < 0)
                or np.any(c.start_bp < 0)
                or np.any(c.l_cont_bp - c.start_bp <= 0)
                or np.any((c.start_bp != 0) * (c.pos == 0))
                or np.any((c.start_bp == 0) * (c.pos != 0))
                or np.any(c.__next__ == c.id)
                or np.any(c.prev == c.id)
            ):
                logger.info("problem!!!!")
                input("what shoud I do????")
            if np.any(c.l_cont == 0) or np.any(c.l_cont_bp == 0):
                logger.info("problem null contig !!!!")
                input("what shoud I do????")

[docs]    def explode_genome(self, dt):
        for i in range(0, self.n_new_frags):
            # print "frag id = ", i
            self.modify_gl_cuda_buffer(i, dt)
            self.gpu_vect_frags.copy_from_gpu()
            max_id = self.gpu_vect_frags.id_c.max()
            self.test_copy_struct(i, 0, 0, max_id)
            self.gpu_vect_frags.copy_from_gpu()
            c = self.gpu_vect_frags
            self.gl_window.remote_update()
            if (
                np.any(c.pos < 0)
                or np.any(c.l_cont < 0)
                or np.any(c.l_cont_bp < 0)
                or np.any(c.start_bp < 0)
                or np.any(c.l_cont_bp - c.start_bp <= 0)
                or np.any((c.start_bp != 0) * (c.pos == 0))
                or np.any((c.start_bp == 0) * (c.pos != 0))
                or np.any(c.__next__ == c.id)
                or np.any(c.prev == c.id)
            ):
                logger.info("problem!!!!")
                input("what shoud I do????")
            if np.any(c.l_cont == 0) or np.any(c.l_cont_bp == 0):
                logger.info("problem null contig !!!!")
                input("what shoud I do????")
        logger.info("genome exploded")
        logger.info("max id = {}".format(max_id))

[docs]    def apply_replay_simu(self, id_fA, id_fB, op_sampled, dt):

        # n_modif = len(op_sampled)
        # for i in xrange(0, n_modif):
        #     self.modify_gl_cuda_buffer(i, dt)
        #     self.gpu_vect_frags.copy_from_gpu()
        #     max_id = self.gpu_vect_frags.id_c.max()
        #     self.test_copy_struct(id_fA[i], id_fB[i], op_sampled[i], max_id)
        #     self.gpu_vect_frags.copy_from_gpu()
        #     c = self.gpu_vect_frags
        #     self.gl_window.remote_update()

        self.modify_gl_cuda_buffer(id_fA, dt)
        self.gpu_vect_frags.copy_from_gpu()
        max_id = self.gpu_vect_frags.id_c.max()
        self.test_copy_struct(id_fA, id_fB, op_sampled, max_id)
        self.gpu_vect_frags.copy_from_gpu()
        # c = self.gpu_vect_frags
        self.gl_window.remote_update()

[docs]    def display_current_matrix(self, filename):
        self.gpu_vect_frags.copy_from_gpu()
        c = self.gpu_vect_frags
        self.gpu_id_contigs.get(ary=self.cpu_id_contigs)
        pos_frag = np.copy(self.gpu_vect_frags.pos)
        list_id_frags = np.copy(self.gpu_vect_frags.id_d)
        list_id = np.copy(self.cpu_id_contigs)
        list_activ = np.copy(self.gpu_vect_frags.activ)
        unique_contig_id = np.unique(list_id)
        dict_contig = dict()
        full_order = []
        full_order_high = []
        for k in unique_contig_id:
            dict_contig[k] = []
            id_pos = np.ix_(list_id == k)
            is_activ = list_activ[id_pos]
            if np.all(is_activ == 1):
                tmp_ord = np.argsort(pos_frag[id_pos])
                ordered_frag = list_id_frags[id_pos[0][tmp_ord]]
                dict_contig[k].extend(ordered_frag)
                full_order.extend(ordered_frag)
                for i in ordered_frag:
                    ori = c.ori[i]
                    v_high_id_all = list(self.np_sub_frags_id[i])
                    # print "v_high_id_all = ", v_high_id_all
                    # print "sub id =", range(0,v_high_id_all[3])
                    v_high_id = v_high_id_all[: v_high_id_all[3]]
                    id_2_push = list(v_high_id)
                    if ori == -1:
                        id_2_push.reverse()
                    full_order_high.extend(id_2_push)
        # fig = plt.figure(figsize=(10,10))
        # val_max = self.hic_matrix_sub_sampled.max() * 0.01
        # plt.imshow(self.hic_matrix_sub_sampled[np.ix_(full_order,full_order)],
        # vmin=0, vmax=50, interpolation='nearest')
        # fig.savefig(file)
        # plt.show()
        # plt.close()
        # plt.figure()
        # plt.imshow(self.hic_matrix[np.ix_(full_order_high, full_order_high)],
        #            vmin=0, vmax=20,
        #            interpolation='nearest')
        # plt.show()
        matrix = self.hic_matrix[np.ix_(full_order_high, full_order_high)]

        plt.imshow(matrix, vmax=np.percentile(matrix, 99))
        plt.savefig(filename)
        return full_order, dict_contig, full_order_high

[docs]    def genome_content(self):

        self.gpu_vect_frags.copy_from_gpu()
        self.gpu_id_contigs.get(ary=self.cpu_id_contigs)
        pos_frag = np.copy(self.gpu_vect_frags.pos)
        next_frag = np.copy(self.gpu_vect_frags.__next__)
        prev_frag = np.copy(self.gpu_vect_frags.prev)
        start_bp = np.copy(self.gpu_vect_frags.start_bp)
        list_id_frags = np.copy(self.gpu_vect_frags.id_d)
        list_activ = np.copy(self.gpu_vect_frags.activ)
        list_id = np.copy(self.gpu_vect_frags.id_c)
        unique_contig_id = np.unique(list_id)
        dict_contig = dict()

        full_order = []
        for k in unique_contig_id:
            dict_contig[k] = dict()
            dict_contig[k]["id"] = []
            dict_contig[k]["pos"] = []
            dict_contig[k]["next"] = []
            dict_contig[k]["prev"] = []
            dict_contig[k]["start_bp"] = []
            dict_contig[k]["id_c"] = []

            id_pos = np.ix_(list_id == k)[0]
            if np.all(list_activ[id_pos] == 1):
                # print "len id_pos = ", id_pos[0]
                tmp_ord = np.argsort(pos_frag[id_pos])
                # print "tmp_ord = ", tmp_ord
                l_start = start_bp[id_pos[tmp_ord]]
                l_pos = pos_frag[id_pos[tmp_ord]]
                l_id_c = list_id[id_pos[tmp_ord]]
                l_next = next_frag[id_pos[tmp_ord]]
                l_prev = prev_frag[id_pos[tmp_ord]]
                ordered_frag = list_id_frags[id_pos[tmp_ord]]
                dict_contig[k]["id"].extend(ordered_frag)
                dict_contig[k]["pos"].extend(l_pos)
                dict_contig[k]["start_bp"].extend(l_start)
                dict_contig[k]["id_c"].extend(l_id_c)
                dict_contig[k]["prev"].extend(l_prev)
                dict_contig[k]["next"].extend(l_next)
                full_order.extend(ordered_frag)
        return full_order, dict_contig

[docs]    def load_gl_cuda_vbo(self,):
        # CUDA Ressorces
        self.pos_vbo.bind()

        # Depends on whether pyopengl_accelerate is disabled
        try:
            self.gpu_pos = cudagl.RegisteredBuffer(
                int(self.pos_vbo.buffers[0]), cudagl.graphics_map_flags.NONE
            )
            self.gpu_col = cudagl.RegisteredBuffer(
                int(self.col_vbo.buffers[0]), cudagl.graphics_map_flags.NONE
            )
        except AttributeError:
            self.gpu_pos = cudagl.RegisteredBuffer(
                int(self.pos_vbo.buffer), cudagl.graphics_map_flags.NONE
            )
            self.gpu_col = cudagl.RegisteredBuffer(
                int(self.col_vbo.buffer), cudagl.graphics_map_flags.NONE
            )
        self.col_vbo.bind()
        self.gpu_vel = ga.to_gpu(ary=self.vel)

        self.pos_gen_cuda = cuda.mem_alloc(self.pos.nbytes)
        cuda.memcpy_htod(self.pos_gen_cuda, self.pos)
        self.vel_gen_cuda = cuda.mem_alloc(self.vel.nbytes)
        cuda.memcpy_htod(self.vel_gen_cuda, self.vel)

        self.ctx.synchronize()

[docs]    def load_gl_cuda_tex_buffer(self, im_init):
        self.cuda_pbo_resource = cudagl.BufferObject(
            int(self.pbo_im_buffer)
        )  # Mapping GLBuffer to cuda_resource
        self.gpu_im_gl = ga.zeros(
            (self.gl_size_im, self.gl_size_im), dtype=np.int32
        )

        # self.array = cuda.matrix_to_array(im_init, "C") # C-style instead of
        # Fortran-style: row-major self.array = ga.to_gpu(ary=im_init) #
        # C-style instead of Fortran-style: row-major

        # self.texref.set_array(self.array)
        # self.texref.set_flags(cuda.TRSA_OVERRIDE_FORMAT)

[docs]    def modify_image_thresh(self, val):
        self.im_thresh += val
        self.im_thresh = min(self.im_thresh, 255)
        self.im_thresh = max(self.im_thresh, 1)
        self.im_thresh = np.uint8(self.im_thresh)
        # print "threshold image = ", self.im_thresh
        # size_block = (16, 16, 1)
        # size_grid = (int(self.n_frags / 16) + 1, int(self.n_frags / 16) + 1)
        # mapping_obj = self.cuda_pbo_resource.map()
        # im_2_update = mapping_obj.device_ptr()
        # self.gl_update_im_thresh(np.intp(im_2_update), self.im_thresh,
        # block=size_block, grid=size_grid, texrefs=[self.texref])
        # self.ctx.synchronize()
        # mapping_obj.unmap() # Unmap the GlBuffer

[docs]    def modify_gl_cuda_buffer(self, id_fi, dt):

        start = cuda.Event()
        end = cuda.Event()
        size_block = 512
        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_new_frags / size_block + 1), 1)
        self.gpu_counter_select_gl.fill(0)
        self.ctx.synchronize()
        self.kern_select_uniq_id_c(
            self.gpu_vect_frags.get_ptr(),
            self.gpu_uniq_id_c,
            self.gpu_uniq_len,
            self.gpu_counter_select_gl,  # retrieve genome contig count&
            np.int32(self.n_new_frags),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()

        self.n_contigs = self.gpu_counter_select_gl.get()[0]
        self.thrust_module.sort_by_keys_simple(
            self.gpu_uniq_len, int(self.n_contigs), self.gpu_uniq_id_c
        )  # sort the contigs by length

        self.cpu_length_contigs = np.float32(self.gpu_uniq_len.get())
        self.mean_length_contigs = self.cpu_length_contigs[
            : self.n_contigs
        ].mean()

        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_contigs / size_block + 1), 1)
        start.record()
        self.kern_make_old_2_new_id_c(
            self.gpu_uniq_id_c,
            self.gpu_old_2_new_id_c,  # structure to update contig ids
            np.int32(self.n_contigs),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()

        self.gpu_counter_select_gl.fill(0)
        self.ctx.synchronize()
        start.record()
        self.kern_count_vals(
            self.gpu_uniq_len,
            np.int32(1),
            self.gpu_counter_select_gl,  # collect contigs with length 1
            np.int32(self.n_contigs),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()
        size_block = 1024
        block_ = (size_block, 1, 1)
        grid_ = (int(self.n_new_frags // size_block) + 1, 1)
        map_pos = self.gpu_pos.map()
        (pos_ptr, pos_siz) = map_pos.device_ptr_and_size()
        map_col = self.gpu_col.map()
        (col_ptr, col_siz) = map_col.device_ptr_and_size()

        max_id = np.float32(self.n_contigs - 1)

        # self.gpu_vect_frags.copy_from_gpu()
        # c = self.gpu_vect_frags
        # n_un = np.nonzero(c.l_cont == 1)[0].shape[0]
        # self.n_contigs_un = self.gpu_counter_select_gl.get()[0]
        # if n_un != self.n_contigs_un:
        #     raw_input(" WTF?....")

        # update pos particles #####
        self.gl_update_pos(
            self.gpu_uniq_len,
            np.intp(pos_ptr),
            np.intp(col_ptr),
            self.gpu_vel,
            self.pos_gen_cuda,
            self.vel_gen_cuda,
            self.gpu_vect_frags.get_ptr(),
            self.gpu_old_2_new_id_c,
            self.gpu_id_contigs,
            max_id,
            self.n_new_frags,
            np.int32(id_fi),
            self.gpu_counter_select_gl,
            self.rng_states,
            np.int32(self.n_generators),
            dt,
            block=block_,
            grid=grid_,
        )
        self.ctx.synchronize()
        map_pos.unmap()
        map_col.unmap()

        # update image #####
        # compute cumulated length thrust...
        self.thrust_module.prefix_sum(self.gpu_uniq_len, int(self.n_contigs))
        # self.d = self.gpu_uniq_len.get()
        # print "prefix sum done!"
        max_id = np.int32(max_id)
        self.kern_frags_2_gl_pxl(
            self.gpu_vect_frags.get_ptr(),
            self.gpu_vect_gl_pxl_frag,
            self.gpu_uniq_len,
            np.int32(max_id),
            np.float32(self.gl_size_im),
            self.n_new_frags,
            block=block_,
            grid=grid_,
        )
        self.ctx.synchronize()
        # print "frags to pixels!"
        # self.e = self.gpu_vect_gl_pxl_frag.get()
        mapping_obj = self.cuda_pbo_resource.map()
        im_2_update = mapping_obj.device_ptr()

        size_block_4_gl = 1024
        block_ = (int(size_block_4_gl), 1, 1)
        grid_ = (int(self.n_data_4_gl / size_block_4_gl + 1), 1)
        # print "grid = ", grid_
        self.gpu_counter_select_4_gl.fill(0)
        self.ctx.synchronize()
        self.kern_prepare_sparse_call_4_gl(
            self.gpu_rows_4_gl,
            self.gpu_cols_4_gl,
            self.gpu_ptr_4_gl,
            self.gpu_vect_gl_pxl_frag,
            self.gpu_info_blocks,
            self.gpu_counter_select_4_gl,
            np.int32(self.gl_size_im),
            np.int32(self.n_data_4_gl),
            block=block_,
            grid=grid_,
        )
        self.ctx.synchronize()
        self.gpu_im_gl.fill(0)
        self.ctx.synchronize()
        self.kern_update_matrix(
            self.gpu_rows_4_gl,
            self.gpu_cols_4_gl,
            self.gpu_data_4_gl,
            self.gpu_ptr_4_gl,
            self.gpu_info_blocks,
            self.gpu_vect_gl_pxl_frag,
            self.gpu_im_gl,
            np.int32(self.gl_size_im),
            np.int32(self.n_data_4_gl),
            block=block_,
            grid=grid_,
        )
        self.ctx.synchronize()

        self.im_thresh = max(1, self.im_thresh)
        all_pix_gl = self.gl_size_im ** 2
        grid_all = (int(all_pix_gl / 1024 + 1), 1)
        self.kern_update_gl_buffer(
            np.intp(im_2_update),
            self.gpu_im_gl,
            np.int32(self.im_thresh),
            np.int32(all_pix_gl),
            block=(1024, 1, 1),
            grid=grid_all,
        )
        self.ctx.synchronize()
        mapping_obj.unmap()  # Unmap the GlBuffer
        # print "time to update gl image = ", t1 - t0
        return max_id

[docs]    def test_thrust(self):
        t_start = time.time()
        self.thrust_module.sort_by_keys_zip(
            self.gpu_sub_sp_no_rep_rows,
            int(self.n_sub_vals),
            self.gpu_sub_sp_no_rep_cols,
        )
        t_end = time.time()
        logger.info(
            "GPU thrust sort 1: elapsed time  = {}".format(t_end - t_start)
        )
        self.thrust_module.sort_by_keys_zip(
            self.gpu_sub_sp_no_rep_rows,
            int(self.n_sub_vals),
            self.gpu_sub_sp_no_rep_data,
        )
        t_end_2 = time.time()
        logger.info(
            "GPU thrust sort 2: elapsed time  = {}".format(t_end_2 - t_end)
        )

[docs]    def prepare_sparse_call(self):
        size_block = 512
        block_ = (size_block, 1, 1)
        n_blocks = int(self.n_sub_vals // size_block + 1)
        grid_ = (n_blocks, 1)
        start = cuda.Event()
        end = cuda.Event()

        self.gpu_counter_select.fill(0)
        self.ctx.synchronize()

        self.gpu_sub_sp_block_indptr = ga.to_gpu(
            np.zeros((self.n_sub_vals,), dtype=np.int32)
        )
        self.gpu_info_blocks = ga.to_gpu(
            np.zeros((n_blocks,), dtype=self.int3)
        )
        # print "grid = ", grid_
        self.gpu_counter_select.fill(0)
        self.ctx.synchronize()

        start.record()
        self.kern_prepare_sparse_call(
            self.gpu_sub_sp_no_rep_rows,
            self.gpu_info_blocks,
            self.gpu_sub_sp_block_indptr,
            self.gpu_counter_select,
            np.int32(self.n_sub_vals),
            block=block_,
            grid=grid_,
        )
        end.record()
        end.synchronize()
        elapsed_seconds = end.time_since(start) * 1e-3
        logger.debug(
            "GPU prepare sparse call: elapsed time  = {}".format(
                elapsed_seconds
            )
        )

[docs]    def return_rippe_vals(self, p0):
        y_eval = opti.peval(self.bins, p0)
        return y_eval

[docs]    def f_rippe(self, x, param):

        kuhn, lm, c1, slope, d, d_max, fact, d_nuc = param[0]

        if x < d_max:
            rippe = fact * (
                0.53
                * (kuhn ** -3.)
                * np.power((lm * x / kuhn), slope)
                * np.exp((d - 2) / ((np.power((lm * x / kuhn), 2) + d)))
            )
        else:
            rippe = d_nuc

        return rippe

[docs]    def f_hic(self, x, param):
        d_init, d_max, alpha_0, alpha_1, A, v_inter = param[0]
        hic_c = np.zeros(x.shape)
        val_lim_0 = A * np.power(d_init, alpha_0 - alpha_1)
        for i in range(0, len(hic_c)):
            if x[i] < d_init:
                hic_c[i] = A * np.power(x[i], alpha_0)
            else:
                hic_c[i] = val_lim_0 * np.power(x[i], alpha_1)

        return hic_c

[docs]    def step_nuisance_parameters(self, dt, t, n_step):

        self.gpu_vect_frags.copy_from_gpu()
        # max_id = self.modify_gl_cuda_buffer(0, dt)
        self.gl_window.remote_update()

        curr_param = np.copy(self.param_simu)
        kuhn, lm, c1, slope, d, d_max, fact, d_nuc = curr_param[0]
        # print " curr param = ", curr_param[0]
        self.sigma_fact = 10 ** (np.log10(fact) - 2)  # for G1
        self.sigma_slope = 0.005  # test malaysian
        self.sigma_d_max = 100  # test simu s1
        self.sigma_d_nuc = 10 ** (np.log10(d_nuc) - 2)  # test s1
        self.sigma_d = 10  # ok for s1
        # randomly select a modifier
        id_modif = np.random.choice(4)
        # id_modif = np.random.choice(4)
        # print "id_modif", id_modif

        if id_modif == 0:  # scale factor
            new_fact = fact + np.random.normal(loc=0.0, scale=self.sigma_fact)
            test_param = [kuhn, lm, slope, d, new_fact]
            # new_d_max = opti.estimate_max_dist_intra(test_param, d_nuc)
            new_d_max = opti.estimate_max_dist_intra_nuis(
                test_param, d_nuc, d_max
            )
            c1 = np.float32(
                (0.53 * np.power(lm / kuhn, slope)) * np.power(kuhn, -3)
            )
            out_test_param = [
                (kuhn, lm, c1, slope, d, new_d_max, new_fact, d_nuc)
            ]
        elif id_modif == 1:  # slope
            new_slope = slope + np.random.normal(
                loc=0.0, scale=self.sigma_slope
            )
            test_param = [kuhn, lm, new_slope, d, fact]
            # new_d_max = opti.estimate_max_dist_intra(test_param, d_nuc)
            new_d_max = opti.estimate_max_dist_intra_nuis(
                test_param, d_nuc, d_max
            )
            c1 = np.float32(
                (0.53 * np.power(lm / kuhn, new_slope)) * np.power(kuhn, -3)
            )
            out_test_param = [
                (kuhn, lm, c1, new_slope, d, new_d_max, fact, d_nuc)
            ]
        elif id_modif == 2:  # max distance intra
            new_d_max = d_max + np.random.normal(
                loc=0.0, scale=self.sigma_d_max
            )
            test_param = [kuhn, lm, slope, d, fact]
            new_d_nuc = opti.peval(new_d_max, test_param)
            c1 = np.float32(
                (0.53 * np.power(lm / kuhn, slope)) * np.power(kuhn, -3)
            )
            out_test_param = [
                (kuhn, lm, c1, slope, d, new_d_max, fact, new_d_nuc)
            ]
        elif id_modif == 3:  # val trans
            if self.sigma_d_nuc <= 0:
                new_d_nuc = d_nuc
            else:
                new_d_nuc = d_nuc + np.random.normal(
                    loc=0.0, scale=self.sigma_d_nuc
                )
            test_param = [kuhn, lm, slope, d, fact]
            # new_d_max = opti.estimate_max_dist_intra(test_param, new_d_nuc)
            new_d_max = opti.estimate_max_dist_intra_nuis(
                test_param, new_d_nuc, d_max
            )
            c1 = np.float32(
                (0.53 * np.power(lm / kuhn, slope)) * np.power(kuhn, -3)
            )
            out_test_param = [
                (kuhn, lm, c1, slope, d, new_d_max, fact, new_d_nuc)
            ]
        else:  # d
            new_d = d + np.random.normal(loc=0.0, scale=self.sigma_d)

            test_param = [kuhn, lm, slope, new_d, fact]
            # new_d_max = opti.estimate_max_dist_intra(test_param, d_nuc)
            new_d_max = opti.estimate_max_dist_intra_nuis(
                test_param, d_nuc, d_max
            )
            c1 = np.float32(
                (0.53 * np.power(lm / kuhn, slope)) * np.power(kuhn, -3)
            )
            out_test_param = [
                (kuhn, lm, c1, slope, new_d, new_d_max, fact, d_nuc)
            ]

        out_test_param = np.array(out_test_param, dtype=self.param_simu_T)
        # print "test param = ", test_param
        # print "out test param = ",out_test_param
        self.param_simu_test = out_test_param
        cuda.memcpy_htod(self.gpu_param_simu_test, out_test_param)

        self.likelihood_nuis = self.eval_likelihood_4_nuisance()
        F_t = self.temperature(t, n_step)
        ratio = np.exp((self.likelihood_nuis - self.likelihood_t) / F_t)
        u = np.random.rand()
        success = 0
        if ratio >= u:
            # print "success"
            success = 1
            cuda.memcpy_htod(self.gpu_param_simu, out_test_param)
            self.param_simu = out_test_param
            self.likelihood_t = self.likelihood_nuis
        # else:
        #     print "reject"

        kuhn, lm, c1, slope, d, d_max, fact, d_nuc = self.param_simu[0]
        p0 = [kuhn, lm, slope, d, fact]
        y_rippe = self.return_rippe_vals(p0)
        return (
            fact,
            d,
            d_max,
            d_nuc,
            slope,
            self.likelihood_t,
            success,
            y_rippe,
        )

[docs]    def setup_distri_frags(self,):
        # generates random variables for every frags
        # TO DO : prendre en compte la composition initiale pour les probas!!!

        logger.info("setup jumping distribution: start")
        self.distri_frags = dict()
        fact = 3.0
        # print "N frags = ", self.n_frags
        logger.info(
            "Shape sub mat = {}".format(
                self.sym_sub_sampled_sparse_matrix.shape
            )
        )
        # print "shape indices = ",
        # self.sub_sampled_sparse_matrix.indices.shape
        # print "shape indptr = ", self.sub_sampled_sparse_matrix.indptr.shape
        for i in range(0, self.n_frags):
            start = self.sym_sub_sampled_sparse_matrix.indptr[i]
            end = self.sym_sub_sampled_sparse_matrix.indptr[i + 1]
            # print "id_sp max= ", np.max(id_sp)
            vk = self.sym_sub_sampled_sparse_matrix.data[start:end]
            yk = self.sym_sub_sampled_sparse_matrix.indices[start:end]

            # remove auto contact
            vtmp_1 = np.copy(vk)
            xk_tmp = np.copy(yk)
            id_hetero_contact = np.nonzero(yk != i)[0]

            xk = xk_tmp[id_hetero_contact]
            vtmp = vtmp_1[id_hetero_contact]
            dat = np.float32(vtmp) * fact
            # remove auto contact
            if dat.sum() > 0:
                # tmp_pk = (dat / dat.sum())**fact
                # pk = tmp_pk / tmp_pk.sum()
                pk = dat / np.linalg.norm(dat, 1)
            else:
                tmp = np.ones_like(dat, dtype=np.float32)
                pk = tmp / tmp.sum()

            if len(xk) > 0:
                self.distri_frags[i] = dict()
                # self.distri_frags[i]['distri'] =
                # stats.rv_discrete(name='frag_'+str(i), values=(xk, pk))
                self.distri_frags[i]["distri"] = "ok"
                self.distri_frags[i]["xk"] = xk
                self.distri_frags[i]["pk"] = pk
            else:
                self.distri_frags[i] = dict()
                self.distri_frags[i]["distri"] = None
        logger.info("setup jumping distribution: done")

[docs]    def return_neighbours(self, id_fA, delta0):
        # print "id_frag = ", id_fA

        # delta0 = self.n_neighbours

        # self.n_neighbours = 10

        ori_id = self.gpu_vect_frags.id_d[id_fA]
        # delta = min(self.n_neighbours, delta0)
        delta = delta0

        # DEBUG
        # if ori_id in self.id_frag_duplicated:
        #     delta = max(self.n_neighbors, delta)

        # fact = 3
        # pk = self.distri_frags[ori_id]['pk']**fact
        # distri = pk / pk.sum()

        if self.distri_frags[ori_id]["distri"] is not None:
            distri = self.distri_frags[ori_id]["pk"]
            n_max_candidates = min(delta, np.nonzero(distri != 0)[0].shape[0])

            init_id = np.random.choice(
                self.distri_frags[ori_id]["xk"],
                n_max_candidates,
                p=distri,
                replace=False,
            )
        else:
            init_id = np.random.choice(self.n_frags, delta, replace=False)
        out = []

        if ori_id in self.id_frag_duplicated:
            d = self.frag_dispatcher[ori_id]
            dup = np.lib.arraysetops.setdiff1d(
                self.collector_id_repeats[d["x"] : d["y"]], id_fA
            )
            out.extend(dup)

        for id_fB in init_id:
            d = self.frag_dispatcher[id_fB]
            out.extend(self.collector_id_repeats[d["x"] : d["y"]])

        real_out = []
        for ele in out:
            if ele not in self.id_frags_blacklisted:
                real_out.append(ele)

        return real_out

[docs]    def set_jumping_distributions_parameters(self, delta):
        self.define_neighbourhood()
        self.jump_dictionnary = dict()
        for i in range(0, self.n_frags):
            id_neighbours = self.sorted_neighbours[i, -delta:]
            # id_no_neighbours = self.sorted_neighbours[i, 0:delta]
            scores = np.array(
                self.matrix_normalized[i, id_neighbours], dtype=np.float32
            )
            # val_mean = self.matrix_normalized[i, id_no_neighbours].mean()
            norm_scores = scores / scores.sum()
            self.jump_dictionnary[i] = dict()
            self.jump_dictionnary[i]["proba"] = norm_scores
            self.jump_dictionnary[i]["frags"] = np.array(
                id_neighbours, dtype=np.int32
            )
            self.jump_dictionnary[i]["distri"] = np.zeros(
                (self.n_frags), dtype=np.float32
            )
            self.jump_dictionnary[i]["set_frags"] = set()
            for k in range(0, delta):
                id_frag = self.jump_dictionnary[i]["frags"][k]
                self.jump_dictionnary[i]["set_frags"].add(id_frag)
                proba = self.jump_dictionnary[i]["proba"][k]
                self.jump_dictionnary[i]["distri"][id_frag] = proba
        # test = True
        # while test:
        #     id = raw_input(" give a frag id ?")
        #     id = int(id)
        #     print self.jump_dictionnary[id]
        #     test = raw_input(" keep on ? ")
        #     test = int(test) != 0

[docs]    def temperature(self, t, n_step):
        # T0 = np.float32(6 * 10 ** 3)
        # Tf = np.float32(6*10 ** 2)
        #
        # n_step = n_step
        # limit_rejection = 0.5
        # if t <= n_step * limit_rejection:
        #     val = T0 * (Tf / T0)**(t / (n_step* limit_rejection))
        # else:
        #     val = T0 * (Tf / T0)**(limit_rejection)
        #     # val = Tf
        # # print "temperature = ", val
        val = 1.0
        return val

[docs]    def free_gpu(self,):

        self.gpu_vect_frags.__del__()
        self.rng_states.free()
        (free, total) = cuda.mem_get_info()
        logger.debug(
            (
                "Global memory occupancy after cleaning processes: %f%% free"
                % (free * 100 / total)
            )
        )
        logger.debug(("Global free memory  :%i Mo free" % (free / 10 ** 6)))
        self.ctx.detach()
        del self.module

[docs]    def modify_param_simu(self, param_simu, id_val, val):
        new_param_simu = np.copy(param_simu)
        if id_val == 0:
            new_param_simu["d"] = np.float32(val)
        elif id_val == 1:
            new_param_simu["slope"] = np.float32(val)

        return new_param_simu