Source code for instagraal.simu_single

#!/usr/bin/env python3

# import pp
import os
import instagraal.pyramid_sparse as pyr

# import Image
# from OpenGL.GL import *
import OpenGL.GL
from OpenGL.arrays import vbo
import numpy as np
from instagraal.cuda_lib_gl_single import sampler as sampler_lib

# from cuda_lib_gl import sampler as sampler_lib
import matplotlib.pyplot as plt

from instagraal import log
from instagraal.log import logger

logger.setLevel(log.CURRENT_LOG_LEVEL)

# cuda.init()


[docs]def kth_diag_indices(a, k):
    rows, cols = np.diag_indices_from(a)
    if k < 0:
        return rows[:k], cols[-k:]
    elif k > 0:
        return rows[k:], cols[:-k]
    else:
        return rows, cols


[docs]class simulation:
    def __init__(
        self,
        name,
        folder_path,
        fasta,
        level,
        n_iterations,
        is_simu,
        gl_window,
        use_rippe,
        gl_size_im,
        thresh_factor=1,
        output_folder=None,
    ):
        self.name = name
        self.use_rippe = use_rippe
        self.str_sub_level = str(level - 1)
        self.str_level = str(level)
        self.thresh_factor = thresh_factor

        self.data_set = name

        toolbox_directory = os.path.dirname(os.path.abspath(__file__))

        self.data_set_root = toolbox_directory
        self.dir_home = toolbox_directory
        self.data_set_root = toolbox_directory

        self.fasta = fasta
        # default_level = size_pyramid - 1
        # self.base_folder = os.path.join(
        #     self.data_set_root, self.data_set
        # )
        self.base_folder = folder_path

        if output_folder is None:
            self.output_folder = os.path.join(self.data_set_root, "results")
        else:
            self.output_folder = output_folder

        self.select_data_set(name)
        self.n_iterations = n_iterations
        self.gl_size_im = gl_size_im
        self.int4 = np.dtype(
            [
                ("x", np.int32),
                ("y", np.int32),
                ("z", np.int32),
                ("w", np.int32),
            ],
            align=True,
        )
        self.float3 = np.dtype(
            [("x", np.float32), ("y", np.float32), ("z", np.float32)],
            align=True,
        )
        self.float4 = np.dtype(
            [
                ("x", np.float32),
                ("y", np.float32),
                ("z", np.float32),
                ("w", np.float32),
            ],
            align=True,
        )
        self.int3 = np.dtype(
            [("x", np.int32), ("y", np.int32), ("z", np.int32)], align=True
        )
        self.int2 = np.dtype([("x", np.int32), ("y", np.int32)], align=True)

        self.level = self.hic_pyr.get_level(level)

        self.level.build_seq_per_bin(genome_fasta=self.fasta)
        self.sub_level = self.hic_pyr.get_level(level - 1)

        self.n_frags = self.level.n_frags
        self.create_sub_frags()

        self.mean_squared_frags_per_bin = np.float32(
            (self.collect_accu_frags.mean()) ** 2
        )
        logger.info(
            "mean frag area = {}".format(self.mean_squared_frags_per_bin)
        )
        self.gl_window = gl_window
        # DEFINE REPEATED SEQ ####
        (
            self.candidate_dup,
            self.data_candidate_dup,
            self.sub_candidates_dup,
            self.sub_candidates_output_data,
        ) = self.select_repeated_frags()

        self.modify_vect_frags()
        self.blacklist_contig()
        # self.discard_low_coverage_frags()
        self.load_gl_buffers()
        self.create_new_sub_frags()
        self.modify_sub_vect_frags()

        self.gl_size_im = min(gl_size_im, self.n_frags)
        self.init_gl_image()

        self.sampler = sampler_lib(
            use_rippe,
            self.new_S_o_A_frags,
            self.collector_id_repeats,
            self.frag_dispatcher,
            self.candidate_dup,
            self.frag_blacklisted,
            self.init_n_frags,
            self.n_frags,
            self.init_n_sub_frags,
            self.n_new_sub_frags,
            self.rep_sub_frags_id,
            self.level.sparse_mat_csr,
            self.np_sub_frags_len_bp,
            self.np_sub_frags_id,
            self.np_sub_frags_accu,
            self.np_sub_frags_2_frags,
            self.mean_squared_frags_per_bin,
            self.norm_vect,
            self.sub_candidates_dup,
            self.sub_candidates_output_data,
            self.new_sub_S_o_A_frags,
            self.sub_collector_id_repeats,
            self.sub_frag_dispatcher,
            self.sub_level.sparse_mat_csr,
            self.sub_level.mean_value_trans,
            n_iterations,
            is_simu,
            self.gl_window,
            self.pos_vbo,
            self.col_vbo,
            self.vel,
            self.pos,
            self.raw_im_init,
            self.pbo_im_buffer,
            self.gl_size_im,
        )

        # display_graph = False
        display_graph = True

        id_start = np.nonzero(self.sampler.gpu_vect_frags.start_bp == 0)[0]
        max_dist_kb = (
            self.sampler.gpu_vect_frags.l_cont_bp[id_start].max() / 1000.
        )
        logger.info("max dist kb = {}".format(max_dist_kb))
        # size_bin_kb = self.sampler.gpu_vect_frags.len_bp.mean() / 1000.0
        mean_size_bin_kb = self.new_sub_S_o_A_frags["len_bp"].mean() / 1000.0

        logger.info("mean size kb = {}".format(mean_size_bin_kb))

        logger.info(
            "min fragment length =  {}".format(
                self.new_sub_S_o_A_frags["len_bp"].min() / 1000.0
            )
        )
        if is_simu:
            self.sampler.simulate_rippe_contacts(
                100, 9.6, -1.5, 0.5, 1, 800, 200
            )
        else:
            if self.use_rippe:
                self.sampler.estimate_parameters_rippe(
                    max_dist_kb, mean_size_bin_kb / 2., display_graph
                )
            else:
                self.sampler.estimate_parameters(
                    max_dist_kb, mean_size_bin_kb / 2, display_graph
                )

        # self.sampler.setup_texture()

[docs]    def blacklist_contig(self):
        # list_blacklist_manual = raw_input("ids (separated by space): ")
        list_blacklist_manual = ""
        if list_blacklist_manual != "":
            list_blacklist_manual = list_blacklist_manual.split(" ")
            candidates_blacklist = [int(i) for i in list_blacklist_manual]
        else:
            candidates_blacklist = []
        init_vect_frags = self.level.S_o_A_frags
        list_id_c = init_vect_frags["id_c"]

        # candidates_blacklist =  []

        frag_blacklisted = []

        for id_c_black in candidates_blacklist:
            id_black_list = np.nonzero(list_id_c == id_c_black)[0]
            for init_f in id_black_list:
                dis = self.frag_dispatcher[init_f]
                ids = self.collector_id_repeats[dis["x"] : dis["y"]]
                frag_blacklisted.extend(list(ids))

        self.frag_blacklisted = frag_blacklisted
        for id_f_black in self.frag_blacklisted:
            self.col_vect_frags_4_GL[id_f_black, 0] = np.float32(0)
            self.col_vect_frags_4_GL[id_f_black, 1] = np.float32(0)
            self.col_vect_frags_4_GL[id_f_black, 2] = np.float32(0)
            self.col_vect_frags_4_GL[id_f_black, 3] = np.float32(0)

[docs]    def discard_low_coverage_frags(self):
        mat = np.copy(self.level.im_curr)
        mat_norm = np.array(
            self.norm_vect.T * self.norm_vect, dtype=np.float32
        )
        self.matrix_normalized = mat / mat_norm
        coverage = self.matrix_normalized.sum(axis=1)
        mean_coverage = coverage.mean()
        std_coverage = coverage.std()
        mean_coverage_ext = mean_coverage - 0.1 * std_coverage
        candidates_low = np.nonzero(coverage < mean_coverage_ext)[0]
        logger.info(
            "n discarded frag of low coverage = {}".format(
                candidates_low.shape[0]
            )
        )
        for init_f in candidates_low:
            dis = self.frag_dispatcher[init_f]
            ids = self.collector_id_repeats[dis["x"] : dis["y"]]
            self.frag_blacklisted.extend(list(ids))

[docs]    def modify_vect_frags(self):

        "include repeated frags"
        modified_vect_frags = dict()
        init_vect_frags = self.level.S_o_A_frags

        # init_max_id_d = init_vect_frags["id"].max()
        max_id_F = len(init_vect_frags["id"])
        max_id_C = init_vect_frags["id_c"].max() + 1

        # HSV_tuples = [(x*1.0/(max_id_C - 1), 0.5, 0.5) for x in range(0,
        # (max_id_C-1))]
        # cmap = plt.cm.gist_ncar
        cmap = plt.cm.prism
        # extract all colors from the .jet map
        cmaplist = [cmap(i) for i in range(cmap.N)]
        id_smple = np.linspace(0, cmap.N, num=max_id_C)
        RGB_tuples = []
        for i in range(0, max_id_C - 1):
            RGB_tuples.append(cmaplist[int(id_smple[i])])

        # RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        self.init_n_frags = len(init_vect_frags["id"])

        modified_vect_frags["pos"] = list(init_vect_frags["pos"])
        modified_vect_frags["sub_pos"] = list(init_vect_frags["sub_pos"])
        modified_vect_frags["id_c"] = list(init_vect_frags["id_c"])
        modified_vect_frags["start_bp"] = list(init_vect_frags["start_bp"])
        modified_vect_frags["len_bp"] = list(init_vect_frags["len_bp"])
        modified_vect_frags["sub_len"] = list(init_vect_frags["sub_len"])
        modified_vect_frags["circ"] = list(init_vect_frags["circ"])
        modified_vect_frags["id"] = list(init_vect_frags["id"])
        modified_vect_frags["prev"] = list(init_vect_frags["prev"])
        modified_vect_frags["next"] = list(init_vect_frags["next"])
        modified_vect_frags["l_cont"] = list(init_vect_frags["l_cont"])
        modified_vect_frags["sub_l_cont"] = list(init_vect_frags["sub_l_cont"])
        modified_vect_frags["l_cont_bp"] = list(init_vect_frags["l_cont_bp"])
        modified_vect_frags["n_accu"] = list(init_vect_frags["n_accu"])
        modified_vect_frags["rep"] = list(np.zeros(max_id_F, dtype=np.int32))
        modified_vect_frags["activ"] = list(np.ones(max_id_F, dtype=np.int32))
        modified_vect_frags["id_d"] = list(init_vect_frags["id"])

        for data_dup in self.data_candidate_dup:
            n_dup = int(data_dup[1])
            id_f = data_dup[0]
            for k in range(0, n_dup):
                modified_vect_frags["pos"].append(0)
                modified_vect_frags["sub_pos"].append(0)
                modified_vect_frags["id_c"].append(max_id_C)
                modified_vect_frags["start_bp"].append(0)
                modified_vect_frags["len_bp"].append(
                    init_vect_frags["len_bp"][id_f]
                )
                modified_vect_frags["sub_len"].append(
                    init_vect_frags["sub_len"][id_f]
                )
                modified_vect_frags["circ"].append(
                    init_vect_frags["circ"][id_f]
                )
                modified_vect_frags["id"].append(max_id_F)
                modified_vect_frags["prev"].append(-1)
                modified_vect_frags["next"].append(-1)
                modified_vect_frags["l_cont"].append(1)
                modified_vect_frags["sub_l_cont"].append(
                    init_vect_frags["sub_len"][id_f]
                )
                modified_vect_frags["l_cont_bp"].append(
                    init_vect_frags["len_bp"][id_f]
                )
                modified_vect_frags["n_accu"].append(
                    init_vect_frags["n_accu"][id_f]
                )
                modified_vect_frags["rep"].append(1)
                modified_vect_frags["activ"].append(1)
                modified_vect_frags["id_d"].append(init_vect_frags["id"][id_f])
                max_id_F += 1
                max_id_C += 1

        modified_vect_frags["pos"] = np.array(
            modified_vect_frags["pos"], dtype=np.int32
        )
        modified_vect_frags["sub_pos"] = np.array(
            modified_vect_frags["sub_pos"], dtype=np.int32
        )
        modified_vect_frags["id_c"] = np.array(
            modified_vect_frags["id_c"], dtype=np.int32
        )
        modified_vect_frags["start_bp"] = np.array(
            modified_vect_frags["start_bp"], dtype=np.int32
        )
        modified_vect_frags["len_bp"] = np.array(
            modified_vect_frags["len_bp"], dtype=np.int32
        )
        modified_vect_frags["sub_len"] = np.array(
            modified_vect_frags["sub_len"], dtype=np.int32
        )
        modified_vect_frags["circ"] = np.array(
            modified_vect_frags["circ"], dtype=np.int32
        )
        modified_vect_frags["id"] = np.array(
            modified_vect_frags["id"], dtype=np.int32
        )
        modified_vect_frags["prev"] = np.array(
            modified_vect_frags["prev"], dtype=np.int32
        )
        modified_vect_frags["next"] = np.array(
            modified_vect_frags["next"], dtype=np.int32
        )
        modified_vect_frags["l_cont"] = np.array(
            modified_vect_frags["l_cont"], dtype=np.int32
        )
        modified_vect_frags["sub_l_cont"] = np.array(
            modified_vect_frags["sub_l_cont"], dtype=np.int32
        )
        modified_vect_frags["l_cont_bp"] = np.array(
            modified_vect_frags["l_cont_bp"], dtype=np.int32
        )
        modified_vect_frags["n_accu"] = np.array(
            modified_vect_frags["n_accu"], dtype=np.int32
        )
        modified_vect_frags["rep"] = np.array(
            modified_vect_frags["rep"], dtype=np.int32
        )
        modified_vect_frags["activ"] = np.array(
            modified_vect_frags["activ"], dtype=np.int32
        )
        modified_vect_frags["id_d"] = np.array(
            modified_vect_frags["id_d"], dtype=np.int32
        )

        id_x = 0
        collector_id_repeats = []
        frag_dispatcher = []
        # BUILD LINK BETWEEN FRAGS AND SUB FRAGS
        for id_f in range(0, self.init_n_frags):
            if id_f in self.candidate_dup:
                id_start = id_x
                id_dup = np.nonzero(modified_vect_frags["id_d"] == id_f)[0]
                collector_id_repeats.extend(list(id_dup))
                n_rep = len(id_dup)
                frag_dispatcher.append(
                    (np.int32(id_start), np.int32(id_start + n_rep))
                )
                id_x += n_rep
            else:
                id_start = id_x
                n_rep = 1
                frag_dispatcher.append(
                    (np.int32(id_start), np.int32(id_start + n_rep))
                )
                collector_id_repeats.append(id_f)
                id_x += 1

        self.collector_id_repeats = np.array(
            collector_id_repeats, dtype=np.int32
        )
        self.frag_dispatcher = np.array(frag_dispatcher, dtype=self.int2)

        self.n_frags = len(modified_vect_frags["id"])

        pos_vect_frags_4_GL = np.ndarray((self.n_frags, 4), dtype=np.float32)
        col_vect_frags_4_GL = np.ndarray((self.n_frags, 4), dtype=np.float32)

        for id_f_curr in range(0, self.n_frags):
            id_d = modified_vect_frags["id_d"][id_f_curr]
            id_c = init_vect_frags["id_c"][id_d]
            pos_vect_frags_4_GL[id_f_curr, 0] = modified_vect_frags["pos"][
                id_f_curr
            ]
            pos_vect_frags_4_GL[id_f_curr, 1] = modified_vect_frags["id_c"][
                id_f_curr
            ]
            pos_vect_frags_4_GL[id_f_curr, 2] = 0.
            pos_vect_frags_4_GL[id_f_curr, 3] = np.float32(1.0)

            col_vect_frags_4_GL[id_f_curr, 0] = np.float32(
                RGB_tuples[id_c - 1][0]
            )
            col_vect_frags_4_GL[id_f_curr, 1] = np.float32(
                RGB_tuples[id_c - 1][1]
            )
            col_vect_frags_4_GL[id_f_curr, 2] = np.float32(
                RGB_tuples[id_c - 1][2]
            )
            col_vect_frags_4_GL[id_f_curr, 3] = np.float32(1.0)

        self.col_vect_frags_4_GL = col_vect_frags_4_GL
        self.pos_vect_frags_4_GL = pos_vect_frags_4_GL
        self.new_S_o_A_frags = modified_vect_frags

[docs]    def modify_sub_vect_frags(self):
        "include repeated frags"
        modified_vect_frags = dict()
        init_vect_frags = self.sub_level.S_o_A_frags

        # init_max_id_d = init_vect_frags["id"].max()
        max_id_F = len(init_vect_frags["id"])
        max_id_C = init_vect_frags["id_c"].max() + 1

        # HSV_tuples = [(x*1.0/(max_id_C - 1), 0.5, 0.5) for x in range(0,
        # (max_id_C-1))]
        # cmap = plt.cm.gist_ncar
        cmap = plt.cm.prism
        # extract all colors from the .jet map
        cmaplist = [cmap(i) for i in range(cmap.N)]
        id_smple = np.linspace(0, cmap.N, num=max_id_C)
        RGB_tuples = []
        for i in range(0, max_id_C - 1):
            RGB_tuples.append(cmaplist[int(id_smple[i])])

        # RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        self.init_n_sub_frags = len(init_vect_frags["id"])

        modified_vect_frags["pos"] = list(init_vect_frags["pos"])
        modified_vect_frags["sub_pos"] = list(init_vect_frags["sub_pos"])
        modified_vect_frags["id_c"] = list(init_vect_frags["id_c"])
        modified_vect_frags["start_bp"] = list(init_vect_frags["start_bp"])
        modified_vect_frags["len_bp"] = list(init_vect_frags["len_bp"])
        modified_vect_frags["sub_len"] = list(init_vect_frags["sub_len"])
        modified_vect_frags["circ"] = list(init_vect_frags["circ"])
        modified_vect_frags["id"] = list(init_vect_frags["id"])
        modified_vect_frags["prev"] = list(init_vect_frags["prev"])
        modified_vect_frags["next"] = list(init_vect_frags["next"])
        modified_vect_frags["l_cont"] = list(init_vect_frags["l_cont"])
        modified_vect_frags["sub_l_cont"] = list(init_vect_frags["sub_l_cont"])
        modified_vect_frags["l_cont_bp"] = list(init_vect_frags["l_cont_bp"])
        modified_vect_frags["n_accu"] = list(init_vect_frags["n_accu"])
        modified_vect_frags["rep"] = list(np.zeros(max_id_F, dtype=np.int32))
        modified_vect_frags["activ"] = list(np.ones(max_id_F, dtype=np.int32))
        modified_vect_frags["id_d"] = list(init_vect_frags["id"])
        # WARNING IMPLICT BREAKING OF THE CONTIGS
        for data_dup in self.sub_candidates_output_data:
            n_dup = int(data_dup[1])
            id_f = data_dup[0]
            for k in range(0, n_dup):
                modified_vect_frags["pos"].append(0)
                modified_vect_frags["sub_pos"].append(0)
                modified_vect_frags["id_c"].append(max_id_C)
                modified_vect_frags["start_bp"].append(0)
                modified_vect_frags["len_bp"].append(
                    init_vect_frags["len_bp"][id_f]
                )
                modified_vect_frags["sub_len"].append(
                    init_vect_frags["sub_len"][id_f]
                )
                modified_vect_frags["circ"].append(
                    init_vect_frags["circ"][id_f]
                )
                modified_vect_frags["id"].append(max_id_F)
                modified_vect_frags["prev"].append(-1)
                modified_vect_frags["next"].append(-1)
                modified_vect_frags["l_cont"].append(1)
                modified_vect_frags["sub_l_cont"].append(
                    init_vect_frags["sub_len"][id_f]
                )
                modified_vect_frags["l_cont_bp"].append(
                    init_vect_frags["len_bp"][id_f]
                )
                modified_vect_frags["n_accu"].append(
                    init_vect_frags["n_accu"][id_f]
                )
                modified_vect_frags["rep"].append(1)
                modified_vect_frags["activ"].append(1)
                modified_vect_frags["id_d"].append(init_vect_frags["id"][id_f])
                max_id_F += 1
                max_id_C += 1

        logger.info("MAX ID CONTIG = {}".format(max_id_C))

        modified_vect_frags["pos"] = np.array(
            modified_vect_frags["pos"], dtype=np.int32
        )
        modified_vect_frags["sub_pos"] = np.array(
            modified_vect_frags["sub_pos"], dtype=np.int32
        )
        modified_vect_frags["id_c"] = np.array(
            modified_vect_frags["id_c"], dtype=np.int32
        )
        modified_vect_frags["start_bp"] = np.array(
            modified_vect_frags["start_bp"], dtype=np.int32
        )
        modified_vect_frags["len_bp"] = np.array(
            modified_vect_frags["len_bp"], dtype=np.int32
        )
        modified_vect_frags["sub_len"] = np.array(
            modified_vect_frags["sub_len"], dtype=np.int32
        )
        modified_vect_frags["circ"] = np.array(
            modified_vect_frags["circ"], dtype=np.int32
        )
        modified_vect_frags["id"] = np.array(
            modified_vect_frags["id"], dtype=np.int32
        )
        modified_vect_frags["prev"] = np.array(
            modified_vect_frags["prev"], dtype=np.int32
        )
        modified_vect_frags["next"] = np.array(
            modified_vect_frags["next"], dtype=np.int32
        )
        modified_vect_frags["l_cont"] = np.array(
            modified_vect_frags["l_cont"], dtype=np.int32
        )
        modified_vect_frags["sub_l_cont"] = np.array(
            modified_vect_frags["sub_l_cont"], dtype=np.int32
        )
        modified_vect_frags["l_cont_bp"] = np.array(
            modified_vect_frags["l_cont_bp"], dtype=np.int32
        )
        modified_vect_frags["n_accu"] = np.array(
            modified_vect_frags["n_accu"], dtype=np.int32
        )
        modified_vect_frags["rep"] = np.array(
            modified_vect_frags["rep"], dtype=np.int32
        )
        modified_vect_frags["activ"] = np.array(
            modified_vect_frags["activ"], dtype=np.int32
        )
        modified_vect_frags["id_d"] = np.array(
            modified_vect_frags["id_d"], dtype=np.int32
        )

        id_x = 0
        collector_id_repeats = []
        frag_dispatcher = []
        for id_f in range(0, self.init_n_sub_frags):
            if id_f in self.sub_candidates_dup:
                id_start = id_x
                id_dup = np.nonzero(modified_vect_frags["id_d"] == id_f)[0]
                collector_id_repeats.extend(list(id_dup))
                n_rep = len(id_dup)
                frag_dispatcher.append(
                    (np.int32(id_start), np.int32(id_start + n_rep))
                )
                id_x += n_rep
            else:
                id_start = id_x
                n_rep = 1
                frag_dispatcher.append(
                    (np.int32(id_start), np.int32(id_start + n_rep))
                )
                collector_id_repeats.append(id_f)
                id_x += 1

        self.sub_collector_id_repeats = np.array(
            collector_id_repeats, dtype=np.int32
        )
        self.sub_frag_dispatcher = np.array(frag_dispatcher, dtype=self.int2)

        self.sub_n_frags = len(modified_vect_frags["id"])

        # pos_vect_frags_4_GL = np.ndarray((self.n_frags, 4), dtype=np.float32)
        # col_vect_frags_4_GL = np.ndarray((self.n_frags, 4), dtype=np.float32)
        #
        # for id_f_curr in xrange(0 , self.sub_n_frags):
        #     id_d = modified_vect_frags['id_d'][id_f_curr]
        #     id_c = init_vect_frags['id_c'][id_d]
        #     pos_vect_frags_4_GL[id_f_curr, 0] =
        #     modified_vect_frags['pos'][id_f_curr]
        #     pos_vect_frags_4_GL[id_f_curr, 1] =
        #     modified_vect_frags['id_c'][id_f_curr]
        #     pos_vect_frags_4_GL[id_f_curr, 2] = 0.
        #     pos_vect_frags_4_GL[id_f_curr, 3] = np.float32(1.0)
        #
        #     col_vect_frags_4_GL[id_f_curr, 0] =
        #     np.float32(RGB_tuples[id_c - 1][0])
        #     col_vect_frags_4_GL[id_f_curr, 1] =
        #     np.float32(RGB_tuples[id_c - 1][1])
        #     col_vect_frags_4_GL[id_f_curr, 2] =
        #     np.float32(RGB_tuples[id_c - 1][2])
        #     col_vect_frags_4_GL[id_f_curr, 3] =
        #     np.float32(1.0)
        #
        # self.sub_col_vect_frags_4_GL = col_vect_frags_4_GL
        # self.sub_pos_vect_frags_4_GL = pos_vect_frags_4_GL
        self.new_sub_S_o_A_frags = modified_vect_frags

[docs]    def select_repeated_frags(self):  #

        # collect_cov = []
        # step = 0
        # p = ProgressBar('green', width=20, block='▣', empty='□')
        # for i in range(0, self.level.n_frags):
        #     v_r = self.level.sparse_mat_csr[i, :]
        #     v_c = self.level.sparse_mat_csc[:, i]
        #     non_zeros = v_c.nnz + v_r.nnz
        #     collect_cov.append(non_zeros)
        #     step += 1
        #     if step%1000 == 0:
        #         pt = step * 100 / self.level.n_frags
        #         p.render(pt, 'step %s\nProcessing...\nDescription: computing
        #         coverage per frag.' % step)

        coverage = np.array(self.level.sparse_mat_csr.sum(axis=0))[0]
        coverage += np.array(
            self.level.sparse_mat_csr.transpose().sum(axis=0)
        )[0]
        mean_coverage = coverage.mean()

        std_coverage = coverage.std()
        mean_coverage_ext = mean_coverage + 3 * std_coverage
        candidates_dup = np.nonzero(coverage > mean_coverage_ext)[0]
        # plt.figure()
        # plt.hist(coverage, 100)
        # plt.figure()
        # plt.plot(coverage)
        # plt.axhline(mean_coverage_ext, color='g')
        # # plt.show()
        # plt.figure()
        # n, bins, patches = plt.hist(coverage, 10000, normed=1,
        # facecolor='blue', alpha=0.75)
        # # add a 'best fit' line
        # (mu, sigma) = norm.fit(coverage)
        # y = mlab.normpdf( bins, mu, sigma)
        # l = plt.plot(bins, y, 'r--', linewidth=2)
        # plt.axvline(mean_coverage_ext, color='g', linewidth=2)
        # #plot
        # plt.xlabel('Raw contacts frequency')
        # plt.ylabel('Probability')
        # plt.title(r'$\mathrm{Histogram\ of\ HiC\ contact\ (data\ %s):}\
        # \mu=%.3f,\ \sigma=%.3f$' %(self.name, mu, sigma))
        # plt.legend(["gaussian fit","duplication limit","exp distribution"],
        # prop={'size':15})
        # plt.grid(True)
        #
        # plt.show()
        # DEBUGGGG ###############s##########
        # candidates_dup = []
        # print "candidate frags for duplication = ", candidates_dup
        test = ""
        # test = raw_input("ok?")

        if not (test == ""):
            # print "---enter id duplicated frags--"
            list_dup_manual = input("ids (separated by space): ")
            if list_dup_manual != "":
                list_dup_manual = list_dup_manual.split(" ")
                candidates_dup = [int(i) for i in list_dup_manual]
            else:
                candidates_dup = []
        # DEBUGGGG #########################
        # candidates_dup = range(880, 890)
        # candidates_dup = range(663, 674)
        # candidates_dup = range(0, 800)
        # candidates_dup = range(1204, 1217)

        candidates_dup = []
        # candidates_dup = range(1978, 2009)
        # print "you have selected: ", candidates_dup
        output_data = []
        sub_candidates_dup = []
        sub_candidates_output_data = []
        for ele in candidates_dup:
            cov_ele = coverage[ele]
            estim_n_dup = np.max(
                [1, np.round(cov_ele / mean_coverage_ext) - 1]
            )
            # estim_n_dup = np.max([1, np.ceil(cov_ele / mean_coverage_ext) -
            # 1])
            output_data.append((ele, estim_n_dup))
            # print "duplicated data = ", output_data
            tmp_sub_ids = self.np_sub_frags_id[ele]
            n_subs = tmp_sub_ids[-1]
            for i in range(0, n_subs):
                sub_candidates_dup.append(tmp_sub_ids[i])
                sub_candidates_output_data.append(
                    (tmp_sub_ids[i], estim_n_dup)
                )
        logger.info("N frag duplicated = {}".format(len(candidates_dup)))
        return (
            candidates_dup,
            output_data,
            sub_candidates_dup,
            sub_candidates_output_data,
        )

[docs]    def select_data_set(self, name):

        size_pyramid = 9
        factor = 3

        self.hic_pyr = pyr.build_and_filter(
            self.base_folder,
            size_pyramid,
            factor,
            thresh_factor=self.thresh_factor,
        )
        logger.info("pyramid loaded")

        if not (os.path.exists(self.output_folder)):
            os.mkdir(self.output_folder)
        self.output_folder = os.path.join(self.output_folder, self.data_set)
        if not (os.path.exists(self.output_folder)):
            os.mkdir(self.output_folder)
        self.output_folder = os.path.join(
            self.output_folder, "test_mcmc_" + self.str_level
        )
        if not (os.path.exists(self.output_folder)):
            os.mkdir(self.output_folder)

        self.new_fasta = os.path.join(self.output_folder, "genome.fasta")
        self.info_frags = os.path.join(self.output_folder, "info_frags.txt")
        self.output_matrix_em = os.path.join(
            self.output_folder, "post_em.tiff"
        )
        self.output_matrix_mcmc = os.path.join(
            self.output_folder, "post_mcmc.tiff"
        )
        self.input_matrix = os.path.join(self.output_folder, "pre_simu.tiff")
        self.scrambled_input_matrix = os.path.join(
            self.output_folder, "scrambled_simu.tiff"
        )

[docs]    def load_gl_buffers(self):
        num = self.n_frags
        pos = np.ndarray((num, 4), dtype=np.float32)
        seed = np.random.rand(2, num)
        pos[:, 0] = seed[0, :]
        pos[:, 1] = 0.0
        pos[:, 2] = seed[1, :]  # z pos
        pos[:, 3] = 1.  # velocity

        # num = self.n_frags
        # pos = np.ndarray((num, 4), dtype=np.float32)
        # pos[:,1] = np.sin(np.arange(0., num) * 2.001 * np.pi / (10*num))
        # pos[:,1] *= np.random.random_sample((num,)) / 3. - 0.2
        # pos[:,2] = np.cos(np.arange(0., num) * 2.001 * np.pi /(10* num))
        # pos[:,2] *= np.random.random_sample((num,)) / 3. - 0.2
        # pos[:,0] = 0. # z pos
        # pos[:,3] = 1. # velocity
        self.pos = pos
        self.pos_vbo = vbo.VBO(
            data=self.pos,
            usage=OpenGL.GL.GL_DYNAMIC_DRAW,
            target=OpenGL.GL.GL_ARRAY_BUFFER,
        )

        # self.pos_vbo = vbo.VBO(data=self.pos_vect_frags_4_GL,
        # usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER)
        self.pos_vbo.bind()

        self.col_vbo = vbo.VBO(
            data=self.col_vect_frags_4_GL,
            usage=OpenGL.GL.GL_DYNAMIC_DRAW,
            target=OpenGL.GL.GL_ARRAY_BUFFER,
        )

        self.col_vbo.bind()

        self.vel = np.ndarray((self.n_frags, 4), dtype=np.float32)
        self.vel[:, 2] = self.pos[:, 2] * 2.
        self.vel[:, 1] = self.pos[:, 1] * 2.
        self.vel[:, 0] = 3.
        self.vel[:, 3] = np.random.random_sample((self.n_frags,))

[docs]    def init_gl_image(self,):

        self.texid = 0
        self.pbo_im_buffer = OpenGL.GL.glGenBuffers(
            1
        )  # generate 1 buffer reference

        OpenGL.GL.glBindBuffer(
            OpenGL.GL.GL_PIXEL_UNPACK_BUFFER, self.pbo_im_buffer
        )  # binding to this buffer

        # self.raw_im_init = np.uint8(np.random.rand(self.gl_size_im,
        # self.gl_size_im))
        self.raw_im_init = np.zeros(
            (self.gl_size_im, self.gl_size_im), dtype=np.uint8
        )

        OpenGL.GL.glBufferData(
            OpenGL.GL.GL_PIXEL_UNPACK_BUFFER,
            self.gl_size_im * self.gl_size_im,
            self.raw_im_init,
            OpenGL.GL.GL_STREAM_DRAW,
        )  # Allocate the buffer

        OpenGL.GL.glGetBufferParameteriv(
            OpenGL.GL.GL_PIXEL_UNPACK_BUFFER, OpenGL.GL.GL_BUFFER_SIZE
        )  # Check allocated buffer size

        #        try:
        #            assert(bsize == self.gl_size_im * self.gl_size_im)
        #        except AssertionError as e:
        #            print str(e)
        OpenGL.GL.glBindBuffer(OpenGL.GL.GL_PIXEL_UNPACK_BUFFER, 0)  # Unbind

        OpenGL.GL.glGenTextures(1, self.texid)  # generate 1 texture reference

        OpenGL.GL.glBindTexture(
            OpenGL.GL.GL_TEXTURE_2D, self.texid
        )  # binding to this texture

        OpenGL.GL.glTexParameteri(
            OpenGL.GL.GL_TEXTURE_2D,
            OpenGL.GL.GL_TEXTURE_MAG_FILTER,
            OpenGL.GL.GL_LINEAR,
        )
        OpenGL.GL.glTexParameteri(
            OpenGL.GL.GL_TEXTURE_2D,
            OpenGL.GL.GL_TEXTURE_MIN_FILTER,
            OpenGL.GL.GL_LINEAR,
        )

        # glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, self.gl_size_im,
        # self.gl_size_im,  0, GL_LUMINANCE, GL_FLOAT, self.raw_im_init) #
        # Allocate the texture
        OpenGL.GL.glTexImage2D(
            OpenGL.GL.GL_TEXTURE_2D,
            0,
            OpenGL.GL.GL_LUMINANCE,
            self.gl_size_im,
            self.gl_size_im,
            0,
            OpenGL.GL.GL_LUMINANCE,
            OpenGL.GL.GL_UNSIGNED_BYTE,
            None,
        )  # Allocate the texture

        OpenGL.GL.glBindTexture(OpenGL.GL.GL_TEXTURE_2D, 0)  # Unbind

        OpenGL.GL.glPixelStorei(
            OpenGL.GL.GL_UNPACK_ALIGNMENT, 1
        )  # 1-byte row alignment
        OpenGL.GL.glPixelStorei(
            OpenGL.GL.GL_PACK_ALIGNMENT, 1
        )  # 1-byte row alignment

[docs]    def create_sub_frags(self):
        self.sub_frags_len_bp = []
        self.sub_frags_id = []
        self.sub_frags_accu = []
        unkb = np.float32(1000.0)
        self.collect_accu_frags = []
        self.norm_vect = []
        n_sub_frags = 0
        self.sub_frags_2_frags = []
        for i in range(0, self.n_frags):
            tmp = [
                self.hic_pyr.spec_level[self.str_level]["fragments_dict"][
                    i + 1
                ]["sub_low_index"]
                - 1,
                self.hic_pyr.spec_level[self.str_level]["fragments_dict"][
                    i + 1
                ]["sub_high_index"]
                - 1,
            ]

            # !!! warning!!! m
            n_sub = tmp[1] - tmp[0] + 1
            v_len = [0, 0, 0]
            v_accu = [0, 0, 0]
            v_id = [0, 0, 0, n_sub]
            n_sub_frags += n_sub
            for j in range(0, n_sub):
                v_len[j] = (
                    np.float32(
                        self.sub_level.vect_frag_np[tmp[0] + j]["len_bp"]
                    )
                    / unkb
                )
                v_id[j] = np.int32(tmp[0] + j)
                v_accu[j] = np.int32(
                    self.sub_level.vect_frag_np[tmp[0] + j]["n_accu"]
                )
                self.collect_accu_frags.append(v_accu[j])

            # create index to go from high res frags to low res frags
            tmp_len = np.array(v_len[0:n_sub], dtype=np.float32)
            for j in range(0, n_sub):
                w_d = np.sum(tmp_len[0:j]) + tmp_len[j] / 2.  # watson distance
                c_d = (
                    np.sum(tmp_len[list(range(n_sub - 1, j, -1))])
                    + tmp_len[j] / 2.
                )  # crick distance
                # dat = (i, w_d, c_d)
                dat = (i, w_d, c_d, j)
                self.sub_frags_2_frags.append(dat)

            self.norm_vect.append(np.sum(v_accu))
            self.sub_frags_len_bp.append(tuple(v_len))
            self.sub_frags_id.append(tuple(v_id))
            self.sub_frags_accu.append(tuple(v_accu))

        # self.np_sub_frags_2_frags = np.array(self.sub_frags_2_frags,
        # dtype=self.float3)
        self.np_sub_frags_2_frags = np.array(
            self.sub_frags_2_frags, dtype=self.float4
        )
        self.np_sub_frags_len_bp = np.array(
            self.sub_frags_len_bp, dtype=self.float3
        )
        self.np_sub_frags_accu = np.array(self.sub_frags_accu, dtype=self.int3)
        self.np_sub_frags_id = np.array(self.sub_frags_id, dtype=self.int4)
        self.collect_accu_frags = np.array(
            self.collect_accu_frags, dtype=np.float32
        )
        self.norm_vect = np.mat(self.norm_vect)
        self.init_n_sub_frags = n_sub_frags

[docs]    def create_new_sub_frags(self,):
        out = 0
        rep_sub_frags_id = []
        idx = 0
        for i in range(0, self.n_frags):
            id_d = self.new_S_o_A_frags["id_d"][i]
            n_sub = self.np_sub_frags_id[id_d]["w"]
            v_id = [0, 0, 0, n_sub]
            for j in range(0, n_sub):
                v_id[j] = np.int32(idx)
                idx += 1
            rep_sub_frags_id.append(tuple(v_id))
            out += n_sub
        self.n_new_sub_frags = out
        self.rep_sub_frags_id = np.array(rep_sub_frags_id, dtype=self.int4)

[docs]    def plot_info_simu(
        self,
        collect_likelihood_input,
        collect_n_contigs_input,
        file_plot,
        title_ax,
    ):
        collect_likelihood = np.array(collect_likelihood_input)
        collect_n_contigs = np.array(collect_n_contigs_input)
        len_collect = len(collect_likelihood)
        if len_collect > 1000:
            idx_2_plot = np.arange(1000, len_collect)
        else:
            idx_2_plot = np.arange(0, len_collect)

        fig = plt.figure(figsize=(10, 10), dpi=100)
        ax1 = fig.add_subplot(111)
        ax1.plot(collect_likelihood[idx_2_plot], "r-")
        ax1.set_xlabel("iterations")
        # Make the y-axis label and tick labels match the line color.
        ax1.set_ylabel("likelihood", color="r")
        for tl in ax1.get_yticklabels():
            tl.set_color("r")
        ax2 = ax1.twinx()
        # if title_ax == "distance from init genome":
        #     ax2.semilogy(collect_n_contigs, 'b-')
        # else:
        ax2.plot(collect_n_contigs[idx_2_plot], "b-")
        ax2.set_ylabel(title_ax, color="b")
        for tl in ax2.get_yticklabels():
            tl.set_color("b")
        plt.show()
        fig.savefig(file_plot)

        if len_collect > 1000:
            plt.figure()
            plt.hist(collect_n_contigs[idx_2_plot], 100)
            plt.title("histogram " + title_ax)
            plt.xlabel(title_ax)
            plt.ylabel("counts")
            plt.show()

    # def plot_all_info_simu(self, all_data, headers):
    #     print "here we go!!"
    #     folder = self.output_folder
    #     ## generate files ##
    #     n_vals =
    #     # the main axes is subplot(111) by default
    #     plt.plot(t, s)
    #     plt.axis([0, 1, 1.1*amin(s), 2*amax(s) ])
    #     plt.xlabel('time (s)')
    #     plt.ylabel('current (nA)')
    #     plt.title('Gaussian colored noise')
    #
    #     # this is an inset axes over the main axes
    #     a = plt.axes([.65, .6, .2, .2], axisbg='w')
    #     n, bins, patches = plt.hist(s, 400, normed=1)
    #     plt.title('Counts')
    #     plt.setp(a, xticks=[], yticks=[])
    #     plt.show()

[docs]    def export_new_fasta(self):
        self.sampler.gpu_vect_frags.copy_from_gpu()
        self.level.generate_new_fasta(
            self.sampler.gpu_vect_frags, self.new_fasta, self.info_frags
        )

[docs]    def release(self):
        self.sampler.free_gpu()