# Source code for instagraal.pyramid_sparse

#!/usr/bin/env python3
"""Pyramid library

Create and handle so-called 'pyramid' objects, i.e. a series of
decreasing-resolution contact maps in hdf5 format.
"""

import os
import shutil
import h5py
import sys
import numpy as np
from instagraal.fragment import basic_fragment as B_frag
from matplotlib import pyplot as plt
import scipy.sparse as sp
from instagraal import log
from instagraal.log import logger

logger.setLevel(log.CURRENT_LOG_LEVEL)


def file_len(fname):
    """Return the number of lines in a text file.

    Parameters
    ----------
    fname : str or pathlib.Path
        Path of the file to measure.

    Returns
    -------
    int
        Number of lines in the file; 0 for an empty file.
    """
    # The original left ``i`` unassigned when the file was empty, raising
    # UnboundLocalError on ``return i``; initialising it makes empty files
    # report 0 lines instead of crashing.
    i = 0
    with open(fname) as handle:
        for i, _ in enumerate(handle, 1):
            pass
    return i
def build_and_filter(base_folder, size_pyramid, factor, thresh_factor=1):
    """Build a filtered pyramid of contact maps

    Build a fragment pyramid for multi-scale analysis and remove high
    sparsity (i.e. low-coverage) and short fragments.

    Parameters
    ----------
    base_folder : str or pathlib.Path
        Where to create the hdf5 files containing the matrices.
    size_pyramid : int
        How many levels (contact maps of decreasing resolution) to generate.
    factor : int
        Subsampling factor (binning) from one level to the next.
    thresh_factor : float, optional
        Number of standard deviations below the mean coverage beyond which
        lesser covered fragments will be discarded. Default is 1.

    Returns
    -------
    obj_pyramid : Pyramid
        The pyramid object containing all the levels.
    """
    min_bin_per_contig = 1
    fact_sub_sampling = factor
    all_pyramid_folder = os.path.join(base_folder, "pyramids")
    if not (os.path.exists(all_pyramid_folder)):
        os.mkdir(all_pyramid_folder)
    # A single-level, unfiltered pyramid is built first; its level-0 text
    # files are the input of the filtering step below.
    init_pyramid_folder = os.path.join(
        all_pyramid_folder, "pyramid_" + str(1) + "_no_thresh"
    )
    if not (os.path.exists(init_pyramid_folder)):
        init_size_pyramid = 1
        build(base_folder, init_size_pyramid, factor, min_bin_per_contig)
    init_pyramid_folder_level_0 = os.path.join(init_pyramid_folder, "level_0")
    contig_info = os.path.join(
        init_pyramid_folder_level_0, "0_contig_info.txt"
    )
    fragments_list = os.path.join(
        init_pyramid_folder_level_0, "0_fragments_list.txt"
    )
    init_abs_fragments_contacts = os.path.join(
        init_pyramid_folder_level_0, "0_abs_frag_contacts.txt"
    )
    init_pyramid_file = os.path.join(init_pyramid_folder, "pyramid.hdf5")
    # The filtered pyramid lives in its own folder, suffixed "_thresh_auto".
    pyramid_folder = os.path.join(
        all_pyramid_folder, "pyramid_" + str(size_pyramid) + "_thresh_auto"
    )
    if not (os.path.exists(pyramid_folder)):
        os.mkdir(pyramid_folder)
    level = 0
    pyramid_level_folder = os.path.join(pyramid_folder, "level_" + str(level))
    if not (os.path.exists(pyramid_level_folder)):
        os.mkdir(pyramid_level_folder)
    current_contig_info = os.path.join(
        pyramid_level_folder, str(level) + "_contig_info.txt"
    )
    current_frag_list = os.path.join(
        pyramid_level_folder, str(level) + "_fragments_list.txt"
    )
    current_abs_fragments_contacts = os.path.join(
        pyramid_level_folder, str(level) + "_abs_frag_contacts.txt"
    )
    # Filtering is only run once: if the three level-0 files already exist
    # the step is skipped (idempotent restarts).
    if not (
        os.path.exists(current_contig_info)
        and os.path.exists(current_frag_list)
        and os.path.exists(current_abs_fragments_contacts)
    ):
        logger.info("start filtering")
        # NOTE(review): h5py.File is opened without an explicit mode; h5py >= 3
        # requires one — confirm the pinned h5py version tolerates this.
        pyramid_0 = h5py.File(init_pyramid_file)
        remove_problematic_fragments(
            contig_info,
            fragments_list,
            init_abs_fragments_contacts,
            current_contig_info,
            current_frag_list,
            current_abs_fragments_contacts,
            pyramid_0,
            thresh_factor=thresh_factor,
        )
        pyramid_0.close()
    # else:
    logger.info("filtering already done...")
    hdf5_pyramid_file = os.path.join(pyramid_folder, "pyramid.hdf5")
    pyramid_handle = h5py.File(hdf5_pyramid_file)
    pyramid_level_folder = os.path.join(pyramid_folder, "level_" + str(level))
    level_pyramid = str(level) + "_"
    sub_2_super_frag_index_file = os.path.join(
        pyramid_level_folder, level_pyramid + "sub_2_super_index_frag.txt"
    )
    # Build each level in turn; "current_*" always points at the previous
    # level's text files, which the next iteration subsamples.
    for level in range(0, size_pyramid):
        pyramid_level_folder = os.path.join(
            pyramid_folder, "level_" + str(level)
        )
        if not (os.path.exists(pyramid_level_folder)):
            os.mkdir(pyramid_level_folder)
        level_pyramid = str(level) + "_"
        new_contig_list_file = os.path.join(
            pyramid_level_folder, level_pyramid + "contig_info.txt"
        )
        new_fragments_list_file = os.path.join(
            pyramid_level_folder, level_pyramid + "fragments_list.txt"
        )
        new_abs_fragments_contacts_file = os.path.join(
            pyramid_level_folder, level_pyramid + "abs_frag_contacts.txt"
        )
        if level > 0:
            if (
                os.path.exists(new_contig_list_file)
                and os.path.exists(new_fragments_list_file)
                and os.path.exists(new_abs_fragments_contacts_file)
                and os.path.exists((sub_2_super_frag_index_file))
            ):
                logger.info("level already built")
                # minus one for the header line
                nfrags = file_len(new_fragments_list_file) - 1
            else:
                # this should never append !!!
                logger.info("writing new_files..")
                nfrags = subsample_data_set(
                    current_contig_info,
                    current_frag_list,
                    fact_sub_sampling,
                    current_abs_fragments_contacts,
                    new_abs_fragments_contacts_file,
                    min_bin_per_contig,
                    new_contig_list_file,
                    new_fragments_list_file,
                    sub_2_super_frag_index_file,
                )
        else:
            # Level 0 files were produced by the filtering step above.
            # NOTE(review): if they are missing, ``nfrags`` stays unassigned
            # and the fill below raises NameError — confirm this path cannot
            # be reached with missing files.
            if (
                os.path.exists(new_contig_list_file)
                and os.path.exists(new_fragments_list_file)
                and os.path.exists(new_abs_fragments_contacts_file)
            ):
                logger.info("level already built...")
                nfrags = file_len(new_fragments_list_file) - 1
        # Per-level completion flag stored as an hdf5 attribute, so an
        # interrupted run can resume without refilling finished levels.
        try:
            status = pyramid_handle.attrs[str(level)] == "done"
        except KeyError:
            pyramid_handle.attrs[str(level)] = "pending"
            status = False
        if not (status):
            logger.info("Start filling the pyramid")
            # level_to_fill = pyramid_handle.create_dataset(str(level),
            # (nfrags,nfrags), 'i')
            # fill_pyramid_level(level_to_fill,new_abs_fragments_contacts_file,
            # size_chunk,nfrags)
            fill_sparse_pyramid_level(
                pyramid_handle, level, new_abs_fragments_contacts_file, nfrags
            )
            pyramid_handle.attrs[str(level)] = "done"
        # Shift the window: this level's outputs feed the next iteration.
        current_frag_list = new_fragments_list_file
        current_contig_info = new_contig_list_file
        current_abs_fragments_contacts = new_abs_fragments_contacts_file
        sub_2_super_frag_index_file = os.path.join(
            pyramid_level_folder, level_pyramid + "sub_2_super_index_frag.txt"
        )
    logger.info("pyramid built.")
    # ``pyramid`` is a class defined elsewhere in this module.
    obj_pyramid = pyramid(pyramid_folder, size_pyramid)
    pyramid_handle.close()
    return obj_pyramid
def build(base_folder, size_pyramid, factor, min_bin_per_contig):
    """Build a pyramid of contact maps

    Build a fragment pyramid for multi-scale analysis

    Parameters
    ----------
    base_folder : str or pathlib.Path
        Where to create the hdf5 files containing the matrices.
    size_pyramid : int
        How many levels (contact maps of decreasing resolution) to generate.
    factor : int
        Subsampling factor (binning) from one level to the next.
    min_bin_per_contig : int
        The minimum number of bins per contig below which binning
        shall not be performed.
    """
    fact_sub_sampling = factor
    # Raw input files produced by the upstream pipeline.
    contig_info = os.path.join(base_folder, "info_contigs.txt")
    fragments_list = os.path.join(base_folder, "fragments_list.txt")
    init_abs_fragments_contacts = os.path.join(
        base_folder, "abs_fragments_contacts_weighted.txt"
    )
    all_pyramid_folder = os.path.join(base_folder, "pyramids")
    pyramid_folder = os.path.join(
        all_pyramid_folder, "pyramid_" + str(size_pyramid) + "_no_thresh"
    )
    if not (os.path.exists(all_pyramid_folder)):
        os.mkdir(all_pyramid_folder)
    if not (os.path.exists(pyramid_folder)):
        os.mkdir(pyramid_folder)
    hdf5_pyramid_file = os.path.join(pyramid_folder, "pyramid.hdf5")
    # NOTE(review): opened without an explicit mode; h5py >= 3 requires one.
    pyramid_handle = h5py.File(hdf5_pyramid_file)
    level = 0
    pyramid_level_folder = os.path.join(pyramid_folder, "level_" + str(level))
    if not (os.path.exists(pyramid_level_folder)):
        os.mkdir(pyramid_level_folder)
    current_contig_info = os.path.join(
        pyramid_level_folder, str(level) + "_contig_info.txt"
    )
    current_frag_list = os.path.join(
        pyramid_level_folder, str(level) + "_fragments_list.txt"
    )
    current_abs_fragments_contacts = os.path.join(
        pyramid_level_folder, str(level) + "_abs_frag_contacts.txt"
    )
    for level in range(0, size_pyramid):
        pyramid_level_folder = os.path.join(
            pyramid_folder, "level_" + str(level)
        )
        if not (os.path.exists(pyramid_level_folder)):
            os.mkdir(pyramid_level_folder)
        level_pyramid = str(level) + "_"
        if level == 0:
            # Level 0 is a straight copy of the raw inputs (fragments list is
            # normalised by init_frag_list, which also counts fragments).
            shutil.copyfile(contig_info, current_contig_info)
            shutil.copyfile(
                init_abs_fragments_contacts,
                current_abs_fragments_contacts
            )
            nfrags = init_frag_list(fragments_list, current_frag_list)
            new_abs_fragments_contacts_file = current_abs_fragments_contacts
            new_contig_list_file = current_contig_info
            new_fragments_list_file = current_frag_list
            sub_2_super_frag_index_file = os.path.join(
                pyramid_level_folder,
                level_pyramid + "sub_2_super_index_frag.txt",
            )
        else:
            new_contig_list_file = os.path.join(
                pyramid_level_folder, level_pyramid + "contig_info.txt"
            )
            new_fragments_list_file = os.path.join(
                pyramid_level_folder, level_pyramid + "fragments_list.txt"
            )
            new_abs_fragments_contacts_file = os.path.join(
                pyramid_level_folder, level_pyramid + "abs_frag_contacts.txt"
            )
            # Skip the (expensive) subsampling if all four outputs exist.
            if (
                os.path.exists(new_contig_list_file)
                and os.path.exists(new_fragments_list_file)
                and os.path.exists(new_abs_fragments_contacts_file)
                and os.path.exists(sub_2_super_frag_index_file)
            ):
                logger.info("level already built...")
                # minus one for the header line
                nfrags = file_len(new_fragments_list_file) - 1
            else:
                logger.info("writing new_files..")
                nfrags = subsample_data_set(
                    current_contig_info,
                    current_frag_list,
                    fact_sub_sampling,
                    current_abs_fragments_contacts,
                    new_abs_fragments_contacts_file,
                    min_bin_per_contig,
                    new_contig_list_file,
                    new_fragments_list_file,
                    sub_2_super_frag_index_file,
                )
        # Completion flag per level, stored as an hdf5 attribute so that an
        # interrupted build can resume.
        try:
            status = pyramid_handle.attrs[str(level)] == "done"
        except KeyError:
            pyramid_handle.attrs[str(level)] = "pending"
            status = False
        if not (status):
            logger.info("Start filling the pyramid")
            fill_sparse_pyramid_level(
                pyramid_handle, level, new_abs_fragments_contacts_file, nfrags
            )
            pyramid_handle.attrs[str(level)] = "done"
        # This level's outputs become the next level's inputs.
        current_frag_list = new_fragments_list_file
        current_contig_info = new_contig_list_file
        current_abs_fragments_contacts = new_abs_fragments_contacts_file
        sub_2_super_frag_index_file = os.path.join(
            pyramid_level_folder, level_pyramid + "sub_2_super_index_frag.txt"
        )
    logger.info("pyramid built.")
    pyramid_handle.close()
def abs_contact_2_coo_file(abs_contact_file, coo_file):
    """Convert contact maps between old-style and new-style formats.

    A legacy function that converts contact maps from the older GRAAL
    format to the simpler instaGRAAL format. This is useful with
    datasets generated by Hi-C box.

    Parameters
    ----------
    abs_contact_file : str, file or pathlib.Path
        The input old-style contact map (1-based fragment ids, one line
        per contact event, header on the first line).
    coo_file : str, file, or pathlib.Path
        The output path to the generated contact map; must be writable.
        Written with 0-based ids and one aggregated count per pair.
    """
    # Count occurrences of each unordered fragment pair, keyed by the
    # smaller 0-based id. Original code leaked/double-closed handles and
    # hand-rolled the nested-dict insertion; behavior is unchanged.
    sparse_dict = dict()
    with open(abs_contact_file, "r") as handle:
        all_lines = handle.readlines()
    # First line is a header; each subsequent line is one contact event
    # (any extra columns beyond the two ids are ignored).
    for line in all_lines[1:]:
        dat = line.split()
        f1, f2 = sorted((int(dat[0]) - 1, int(dat[1]) - 1))
        row = sparse_dict.setdefault(f1, dict())
        row[f2] = row.get(f2, 0) + 1
    with open(coo_file, "w") as h_coo:
        h_coo.write("%s\t%s\t%s\n" % ("id_frag_a", "id_frag_b", "n_contact"))
        # Emit pairs sorted by (id_frag_a, id_frag_b), as before.
        for fa in sorted(sparse_dict):
            d_fb = sparse_dict[fa]
            for fb in sorted(d_fb):
                nc = d_fb[fb]
                h_coo.write("%s\t%s\t%s\n" % (str(fa), str(fb), str(nc)))
def fill_sparse_pyramid_level(pyramid_handle, level, contact_file, nfrags):
    """Fill a level with sparse contact map data

    Fill values from the simple text matrix file to the hdf5-based pyramid
    level with contact data.

    Parameters
    ----------
    pyramid_handle : h5py.File
        The hdf5 file handle containing the whole dataset.
    level : int
        The level (resolution) to be filled with contact data.
    contact_file : str, file or pathlib.Path
        The binned contact map file to be converted to hdf5 data.
    nfrags : int
        The number of fragments/bins in that specific level.
    """
    # Aggregate counts per unordered pair, keyed by the smaller id.
    # Changes vs. original: the input handle is now closed (was leaked) and
    # a dead ``np_csr`` scratch array (filled but never read) was removed.
    sparse_dict = dict()
    with open(contact_file, "r") as handle:
        all_lines = handle.readlines()
    # First line is the header.
    for line in all_lines[1:]:
        dat = line.split()
        nc = int(dat[2])
        f1, f2 = sorted((int(dat[0]), int(dat[1])))
        row = sparse_dict.setdefault(f1, dict())
        row[f2] = row.get(f2, 0) + nc
    # Flatten to COO triplets, rows sorted ascending; column order within a
    # row keeps insertion order, exactly as the original did.
    out_r = []
    out_c = []
    out_d = []
    for r in sorted(sparse_dict):
        data = sparse_dict[r]
        for c in data:
            out_r.append(r)
            out_c.append(c)
            out_d.append(data[c])
    n_on_pxls = len(out_d)
    level_hdf5 = pyramid_handle.create_group(str(level))
    # Row 0 = row ids, row 1 = column ids, row 2 = counts.
    data_2_sparse = level_hdf5.create_dataset("data", (3, n_on_pxls), "i")
    data_nfrags = level_hdf5.create_dataset("nfrags", (1, 1), "i")
    data_2_sparse[0, :] = out_r
    data_2_sparse[1, :] = out_c
    data_2_sparse[2, :] = out_d
    data_nfrags[:] = nfrags
def init_frag_list(fragment_list, new_frag_list):
    """Adapt the original fragment list to fit the build function
    requirements

    Parameters
    ----------
    fragment_list : str, file or pathlib.Path
        The input fragment list.
    new_frag_list : str, file or pathlib.Path
        The output fragment list to be written.

    Returns
    -------
    i : int
        The number of records processed this way.
    """
    row_fmt = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
    handle_frag_list = open(fragment_list, "r")
    handle_new_frag_list = open(new_frag_list, "w")
    handle_new_frag_list.write(
        row_fmt
        % (
            "id",
            "chrom",
            "start_pos",
            "end_pos",
            "size",
            "gc_content",
            "accu_frag",
            "frag_start",
            "frag_end",
        )
    )
    # Drop the input header before processing records.
    handle_frag_list.readline()
    i = 0
    while True:
        line_frag = handle_frag_list.readline()
        if not line_frag:
            # End of input: close both handles and stop.
            handle_frag_list.close()
            handle_new_frag_list.close()
            break
        i += 1
        fields = line_frag.split("\t")
        frag_id = fields[0]
        # Each level-0 fragment accumulates exactly one raw fragment and
        # spans itself (frag_start == frag_end == its own id).
        handle_new_frag_list.write(
            row_fmt
            % (
                frag_id,
                fields[1],
                fields[2],
                fields[3],
                fields[4],
                str(float(fields[5])),
                str(1),
                frag_id,
                frag_id,
            )
        )
    return i
def subsample_data_set(
    contig_info,
    fragments_list,
    fact_sub_sample,
    abs_fragments_contacts,
    new_abs_fragments_contacts_file,
    min_bin_per_contig,
    new_contig_list_file,
    new_fragments_list_file,
    old_2_new_file,
):
    """Bin a level's fragments by ``fact_sub_sample`` and rewrite its files.

    Groups consecutive fragments of each contig into "super" fragments,
    writes the binned contig list, fragment list and contact file, plus a
    mapping file from old (1-based) fragment ids to new super-fragment ids.

    Parameters
    ----------
    contig_info : str or pathlib.Path
        Input per-contig summary file of the current level.
    fragments_list : str or pathlib.Path
        Input per-fragment file of the current level.
    fact_sub_sample : int
        Binning factor; ``<= 1`` means plain copy (identity mapping).
    abs_fragments_contacts : str or pathlib.Path
        Input sparse contact file, or the literal string "SIMU" to skip
        contact rewriting.
    new_abs_fragments_contacts_file : str or pathlib.Path
        Output binned contact file.
    min_bin_per_contig : int
        Contigs that would end up with fewer bins than this are copied
        fragment-by-fragment instead of binned.
    new_contig_list_file, new_fragments_list_file : str or pathlib.Path
        Output contig / fragment lists.
    old_2_new_file : str or pathlib.Path
        Output old-id -> new-id mapping file.

    Returns
    -------
    int
        The number of fragments in the new (binned) level.
    """
    logger.debug("fact sub sampling = {}".format(fact_sub_sample))
    logger.debug(
        "minimum bin numer per contig = {}".format(min_bin_per_contig)
    )
    if fact_sub_sample <= 1:
        # Identity case: copy the three files and write a 1:1 id mapping.
        logger.info("subsampling : nothing to do")
        shutil.copy(fragments_list, new_fragments_list_file)
        shutil.copy(contig_info, new_contig_list_file)
        shutil.copy(abs_fragments_contacts, new_abs_fragments_contacts_file)
        # minus one for the header line
        nfrags = file_len(fragments_list) - 1
        handle_old_2_new = open(old_2_new_file, "w")
        handle_old_2_new.write("%s\t%s\n" % ("current_id", "super_id"))
        for ind in range(0, nfrags):
            curr_id = str(ind + 1)
            super_id = curr_id
            handle_old_2_new.write("%s\t%s\n" % (curr_id, super_id))
        handle_old_2_new.close()
    else:
        logger.info("subsampling : start")
        # old_2_new_frags: old abs id (1-based) -> new abs super-frag id.
        # spec_new_frags: new abs id -> accumulated attributes of the bin.
        old_2_new_frags = dict()
        spec_new_frags = dict()
        handle_new_contigs_list = open(new_contig_list_file, "w")
        handle_new_contigs_list.write(
            "%s\t%s\t%s\t%s\n"
            % ("contig", "length_kb", "n_frags", "cumul_length")
        )
        new_abs_id_frag = 0
        id_frag_abs = 0
        # reading contig info !!!!
        #######################################
        handle_contig_info = open(contig_info, "r")
        handle_contig_info.readline()
        sum_length_contigs = 0
        while 1:
            line_contig = handle_contig_info.readline()
            if not line_contig:
                handle_contig_info.close()
                handle_new_contigs_list.close()
                break
            data = line_contig.split("\t")
            init_contig = data[0]
            # id_frag_start = 1
            id_frag_end = int(data[2])
            length_kb = data[1]
            # orientation = "w"
            # Bin this contig only if the binned size stays >= the minimum.
            condition_sub_sample = (
                id_frag_end / np.float32(fact_sub_sample)
            ) >= min_bin_per_contig and not (fact_sub_sample == 1)
            accu_frag = 0
            new_rel_id_frag = 0
            id_frag_rel = 0
            sum_length_contigs += id_frag_end
            if condition_sub_sample:
                for arbind in range(0, id_frag_end):
                    # for id_frag_rel in range(1,id_frag_end+1 ):
                    id_frag_rel += 1
                    id_frag_abs += 1
                    # A new super-fragment starts every fact_sub_sample
                    # fragments (relative id 1, fact+1, 2*fact+1, ...).
                    if id_frag_rel % fact_sub_sample == 1:
                        accu_frag = 0
                        new_abs_id_frag += 1
                        new_rel_id_frag += 1
                        spec_new_frags[new_abs_id_frag] = dict()
                        spec_new_frags[new_abs_id_frag][
                            "frag_start"
                        ] = id_frag_abs
                        spec_new_frags[new_abs_id_frag][
                            "frag_end"
                        ] = id_frag_abs
                    accu_frag += 1
                    old_2_new_frags[id_frag_abs] = new_abs_id_frag
                    spec_new_frags[new_abs_id_frag]["accu_frag"] = accu_frag
                    spec_new_frags[new_abs_id_frag]["id_rel"] = new_rel_id_frag
                    spec_new_frags[new_abs_id_frag][
                        "init_contig"
                    ] = init_contig
                    spec_new_frags[new_abs_id_frag]["gc_content"] = []
                    spec_new_frags[new_abs_id_frag]["size"] = []
                    # frag_end tracks the last absolute id folded in so far.
                    spec_new_frags[new_abs_id_frag]["frag_end"] = id_frag_abs
            else:
                # Contig too small to bin: copy its fragments one-to-one.
                for arbind in range(0, id_frag_end):
                    id_frag_abs += 1
                    new_abs_id_frag += 1
                    new_rel_id_frag += 1
                    id_frag_rel += 1
                    old_2_new_frags[id_frag_abs] = new_abs_id_frag
                    spec_new_frags[new_abs_id_frag] = {
                        "frag_start": id_frag_abs,
                        "frag_end": id_frag_abs,
                        "accu_frag": 1,
                        "init_contig": init_contig,
                        "gc_content": [],
                        "size": [],
                        "id_rel": new_rel_id_frag,
                    }
            handle_new_contigs_list.write(
                "%s\t%s\t%s\t%s\n"
                % (
                    init_contig,
                    length_kb,
                    new_rel_id_frag,
                    new_abs_id_frag - new_rel_id_frag,
                )
            )
        # write new fragments list
        logger.info(
            "size matrix before sub sampling = {}".format(id_frag_abs)
        )
        logger.info(
            "size matrix after sub sampling = {}".format(new_abs_id_frag)
        )
        logger.info("sum length contigs = {}".format(sum_length_contigs))
        # reading fragments list !!!!
        #######################################
        # Second pass: fold each old fragment's coordinates/GC into its bin.
        handle_fragments_list = open(fragments_list, "r")
        handle_fragments_list.readline()
        id_abs = 0
        while 1:
            line_fragments = handle_fragments_list.readline()
            if not line_fragments:
                handle_fragments_list.close()
                break
            id_abs += 1
            # print id_abs
            data = line_fragments.split("\t")
            id_init = int(data[0])
            # contig_name = data[1]
            start_pos = int(data[2])
            end_pos = int(data[3])
            length_kb = int(data[4])
            gc_content = float(data[5])
            # np_id_abs = id_abs
            curr_id = id_init
            init_frag_start = int(data[7])
            init_frag_end = int(data[8])
            id_new = old_2_new_frags[id_abs]
            spec_new_frags[id_new]["gc_content"].append(gc_content)
            # spec_new_frags[id_new]['size'].append(length_kb)
            # First fragment of the bin contributes the start coordinates.
            if id_abs == spec_new_frags[id_new]["frag_start"]:
                spec_new_frags[id_new]["start_pos"] = start_pos
                spec_new_frags[id_new][
                    "init_frag_start"
                ] = init_frag_start  # coord level 0
            # Last fragment contributes the end coordinates and total size.
            if id_abs == spec_new_frags[id_new]["frag_end"]:
                spec_new_frags[id_new]["end_pos"] = end_pos
                spec_new_frags[id_new]["size"] = (
                    end_pos - spec_new_frags[id_new]["start_pos"]
                )
                spec_new_frags[id_new][
                    "init_frag_end"
                ] = init_frag_end  # coord level 0
        keys_new_frags = list(spec_new_frags.keys())
        keys_new_frags.sort()
        handle_new_fragments_list = open(new_fragments_list_file, "w")
        handle_new_fragments_list.write(
            "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
            % (
                "id",
                "chrom",
                "start_pos",
                "end_pos",
                "size",
                "gc_content",
                "accu_frag",
                "init_frag_start",
                "init_frag_end",
                "sub_frag_start",
                "sub_frag_end",
            )
        )
        # print "id problem",spec_new_frags[1]
        nfrags = len(keys_new_frags)
        logger.info("nfrags = {}".format(nfrags))
        for new_frag in keys_new_frags:
            # ``id`` shadows the builtin here (kept as-is from the original).
            id = str(spec_new_frags[new_frag]["id_rel"])
            gc_content = np.array(
                spec_new_frags[new_frag]["gc_content"]
            ).mean()
            size = spec_new_frags[new_frag]["size"]
            start_pos = spec_new_frags[new_frag]["start_pos"]
            end_pos = spec_new_frags[new_frag]["end_pos"]
            chrom = spec_new_frags[new_frag]["init_contig"]
            init_frag_start = spec_new_frags[new_frag]["init_frag_start"]
            init_frag_end = spec_new_frags[new_frag]["init_frag_end"]
            sub_frag_start = spec_new_frags[new_frag]["frag_start"]
            sub_frag_end = spec_new_frags[new_frag]["frag_end"]
            accu_frag = str(int(init_frag_end) - int(init_frag_start) + 1)
            ##########################
            handle_new_fragments_list.write(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                % (
                    id,
                    chrom,
                    start_pos,
                    end_pos,
                    size,
                    gc_content,
                    accu_frag,
                    init_frag_start,
                    init_frag_end,
                    sub_frag_start,
                    sub_frag_end,
                )
            )
        handle_new_fragments_list.close()
        logger.info("new fragments list written...")
        # be careful: the dictionary is based on... (original French comment
        # was truncated in the source)
        if not (abs_fragments_contacts == "SIMU"):
            logger.info("update sparse contacts file...")
            # index 0 based
            handle_new_abs_fragments_contacts = open(
                new_abs_fragments_contacts_file, "w"
            )
            handle_abs_fragments_contacts = open(abs_fragments_contacts, "r")
            handle_new_abs_fragments_contacts.write(
                "%s\t%s\t%s\n" % ("id_frag_a", "id_frag_b", "n_contact")
            )
            handle_abs_fragments_contacts.readline()
            all_lines = handle_abs_fragments_contacts.readlines()
            # NOTE(review): the header was already consumed by readline()
            # above, yet this loop also starts at index 1 — the first data
            # line appears to be skipped; confirm this is intended.
            sparse_dict = dict()
            for id_line in range(1, len(all_lines)):
                line = all_lines[id_line]
                dat = line.split("\t")
                fa = int(dat[0])
                fb = int(dat[1])
                nc = int(dat[2])
                new_abs_id_frag_a = old_2_new_frags[
                    fa + 1
                ]  # old_2_new_frags 1-based index
                new_abs_id_frag_b = old_2_new_frags[
                    fb + 1
                ]  # old_2_new_frags 1-based index
                # Accumulate counts on the unordered (0-based) bin pair.
                mates = [new_abs_id_frag_a - 1, new_abs_id_frag_b - 1]
                mates.sort()
                f1 = mates[0]
                f2 = mates[1]
                if f1 in sparse_dict:
                    if f2 in sparse_dict[f1]:
                        sparse_dict[f1][f2] += nc
                    else:
                        sparse_dict[f1][f2] = nc
                else:
                    sparse_dict[f1] = dict()
                    sparse_dict[f1][f2] = nc
            keys = list(sparse_dict.keys())
            keys.sort()
            for fa in keys:
                d_fb = sparse_dict[fa]
                keys_b = list(d_fb.keys())
                keys_b.sort()
                for fb in keys_b:
                    nc = d_fb[fb]
                    handle_new_abs_fragments_contacts.write(
                        "%s\t%s\t%s\n" % (str(fa), str(fb), str(nc))
                    )
            # NOTE(review): handle_abs_fragments_contacts is never closed.
            handle_new_abs_fragments_contacts.close()
        logger.info("subsampling: done.")
        handle_old_2_new = open(old_2_new_file, "w")
        handle_old_2_new.write("%s\t%s\n" % ("current_id", "super_id"))
        for ind in list(old_2_new_frags.keys()):
            curr_id = str(ind)
            super_id = str(old_2_new_frags[ind])
            handle_old_2_new.write("%s\t%s\n" % (curr_id, super_id))
        handle_old_2_new.close()
    return nfrags
# NOTE: the original author flagged the following fragment-filtering section
# with a "PROBLEM" banner — treat its logic as fragile and review any change
# against real data.
def remove_problematic_fragments(
    contig_info,
    fragments_list,
    abs_fragments_contacts,
    new_contig_list_file,
    new_fragments_list_file,
    new_abs_fragments_contacts_file,
    pyramid,
    thresh_factor=1,
):
    """Filter out low-coverage, over-covered and very short fragments.

    Computes a per-fragment "sparsity" (fraction of non-empty columns in
    the level-0 contact matrix), discards fragments whose sparsity falls
    below ``mean - thresh_factor * std`` or above ``mean + 50 * std``, as
    well as fragments shorter than 50 bp, then rewrites the contig list,
    fragment list and contact file accordingly. Removed fragments are
    merged into their following neighbour on the same contig.

    Parameters
    ----------
    contig_info, fragments_list, abs_fragments_contacts : str or Path
        Level-0 input text files.
    new_contig_list_file, new_fragments_list_file,
    new_abs_fragments_contacts_file : str or Path
        Output (filtered) text files.
    pyramid : h5py.File
        Open handle on the unfiltered single-level pyramid; group "0" is
        read for the sparse contact data.
    thresh_factor : float, optional
        Number of standard deviations below the mean used for the low
        coverage cutoff. Default is 1.

    Returns
    -------
    float
        The low-coverage sparsity threshold that was applied.

    Side effects
    ------------
    Writes ``sparsity_plot.pdf`` and ``sparsity_hist.pdf`` in the current
    working directory.
    """
    import numpy as np

    # full_resolution = pyramid["0"]
    level = pyramid["0"]
    np_2_scipy_sparse = level["data"]
    nfrags = int(level["nfrags"][0])
    logger.info("nfrags = {}".format(nfrags))
    # Rows 0/1 of "data" are (row, col) indices, row 2 the counts.
    sparse_mat_csr = sp.csr_matrix(
        (np_2_scipy_sparse[2, :], np_2_scipy_sparse[0:2, :]),
        shape=(nfrags, nfrags),
    )
    # Symmetrise: stored matrix is upper-triangular (pairs are sorted).
    full_mat = sparse_mat_csr + sparse_mat_csr.transpose()
    # Per-row number of stored entries, normalised by nfrags: a proxy for
    # each fragment's coverage ("sparsity" here means occupancy).
    collect_sparsity = np.float32(np.diff(full_mat.indptr)) / np.float32(
        nfrags
    )
    mean_spars = collect_sparsity.mean()
    std_spars = collect_sparsity.std()
    max_spars = collect_sparsity.max()
    logger.info("n init frags = {}".format(nfrags))
    logger.info("mean sparsity = {}".format(mean_spars))
    logger.info("median sparsity = {}".format(np.median(collect_sparsity)))
    logger.info("std sparsity = {}".format(std_spars))
    logger.info("max_sparsity = {}".format(max_spars))
    # Diagnostic plots, written to the current working directory.
    plt.figure()
    plt.scatter(list(range(len(collect_sparsity))), collect_sparsity)
    plt.savefig("sparsity_plot.pdf")
    plt.close()
    plt.figure()
    plt.hist(collect_sparsity, 100)
    plt.savefig("sparsity_hist.pdf")
    # SENSITIVE PARAMETER ########
    # Alternative thresholds tried on past datasets (kept for reference):
    # thresh = np.percentile(collect_sparsity,25)
    # thresh = max_spars + std_spars
    thresh_max = mean_spars + 50 * std_spars
    # thresh_max = np.percentile(collect_sparsity,np.float64(99.9))
    # thresh = mean_spars - 1.01 * std_spars # para g1
    thresh = mean_spars - thresh_factor * std_spars
    # thresh = 0.01 *mean_spars # for human data chr14 4 and de novo chr14
    # thresh = 0.0015 # for human data chr14 4 and de novo chr14
    # thresh = 0.001 *mean_spars # dde novo s1
    # thresh = max(mean_spars - 0.9 * std_spars,0) # para g1
    # SENSITIVE PARAMETER ########
    # NOTE: this first assignment (zero-coverage fragments) is immediately
    # overwritten by the "<= thresh" selection two lines below.
    list_fragments_problem = list(np.nonzero(collect_sparsity == 0)[0])
    logger.info("thresh sparsity = {}".format(thresh))
    # plt.show()
    list_fragments_problem = list(np.nonzero(collect_sparsity <= thresh)[0])
    list_fragments_problem_too_large = list(
        np.nonzero(collect_sparsity > thresh_max)[0]
    )
    list_fragments_problem.extend(list_fragments_problem_too_large)
    # print(np.array(list_fragments_problem))
    logger.info("cleaning : start")
    import numpy as np

    handle_new_fragments_list = open(new_fragments_list_file, "w")
    handle_new_fragments_list.write(
        "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
        % (
            "id",
            "chrom",
            "start_pos",
            "end_pos",
            "size",
            "gc_content",
            "accu_frag",
            "frag_start",
            "frag_end",
        )
    )
    # build np_id_2_frag dictionary
    np_id_2_frag = get_frag_info_from_fil(fragments_list)
    # remove fragments which are too small
    list_short_frags = []
    thresh_size = 50
    for np_index in np_id_2_frag:
        tmp_frag = np_id_2_frag[np_index]
        if tmp_frag["length"] < thresh_size:
            list_short_frags.append(np_index)
    list_fragments_problem = list(list_fragments_problem)
    list_fragments_problem.extend(list_short_frags)
    list_fragments_problem = np.unique(list_fragments_problem)
    logger.info(
        "number of fragments to remove = {}".format(
            len(list_fragments_problem)
        )
    )
    # build Contigs.init_contigs dictionary ( need to know the number of frags
    # per contig)
    init_contigs, list_init_contig = get_contig_info_from_file(contig_info)
    # prob_frag: "relative_id-contig_name" -> fragment marked for removal.
    prob_frag = dict()
    for np_index in list_fragments_problem:
        tmp_frag = np_id_2_frag[np_index]
        id = tmp_frag["index"] + "-" + tmp_frag["init_contig"]
        prob_frag[id] = {
            "init_contig": tmp_frag["init_contig"],
            "index": tmp_frag["index"],
        }
    # print prob_frag
    contig_info_dict = dict()
    # list_init_contig = init_contigs.keys()
    # list_init_contig.sort()
    for chrom in list_init_contig:
        contig_info_dict[chrom] = {
            "n_frags": init_contigs[chrom]["n_frags"],
            "n_new_frags": 0,
            "length_kb": 0,
        }
    new_id_frag_rel = 0
    new_id_frag_abs = 1
    init_id_frag_abs = 0
    old_2_new_frags = dict()
    handle_fragments_list = open(fragments_list, "r")
    handle_fragments_list.readline()
    spec_new_frags = dict()
    # tmp_cumul accumulates consecutive fragments until one that is NOT
    # flagged for removal ("lock") closes the group and is written out.
    tmp_cumul = {
        "start_pos": 0,
        "end_pos": 0,
        "chrom": 0,
        "size": 0,
        "accu_frag": 0,
        "gc_content": [],
        "lock": False,
        "init_id_frags": [],
        "list_chrom": [],
    }
    ######################################
    step = 0
    while 1:
        line_fragment = handle_fragments_list.readline()
        if not line_fragment:
            # EOF: if the last group ends locked, its fragments had no
            # surviving neighbour to merge into -> mark them destroyed.
            if tmp_cumul["lock"]:
                for ele in tmp_cumul["init_id_frags"]:
                    old_2_new_frags[ele] = "destroyed"
                new_id_frag_abs -= 1
            handle_fragments_list.close()
            handle_new_fragments_list.close()
            break
        step += 1
        init_id_frag_abs += 1
        data = line_fragment.split("\t")
        id = int(data[0])
        # A relative id of 1 marks the start of a new contig: reset the
        # accumulator; a pending locked group is destroyed (it cannot merge
        # across contig boundaries).
        if id == 1:
            new_id_frag_rel = 1
            if not tmp_cumul["lock"]:
                new_id_frag_abs += 0
            else:
                for ele in tmp_cumul["init_id_frags"]:
                    old_2_new_frags[ele] = "destroyed"
            tmp_cumul["gc_content"] = []
            tmp_cumul["start_pos"] = 0
            tmp_cumul["init_id_frags"] = []
            tmp_cumul["list_chrom"] = []
            tmp_cumul["frag_start"] = []
            tmp_cumul["frag_end"] = []
            tmp_cumul["size"] = 0
            # debug
        chrom = data[1]
        start_pos = data[2]
        end_pos = data[3]
        size = int(data[4])
        gc_content = float(data[5])
        accu_frag = int(data[6])
        frag_start = int(data[7])
        frag_end = int(data[8])
        name_frag = str(id) + "-" + chrom
        lock = name_frag in prob_frag
        tmp_cumul["chrom"] = chrom
        tmp_cumul["list_chrom"].append(chrom)
        tmp_cumul["end_pos"] = end_pos
        tmp_cumul["size"] += size
        tmp_cumul["accu_frag"] += accu_frag
        tmp_cumul["lock"] = name_frag in prob_frag
        # Degenerate (size <= 1) fragments are also absorbed.
        if size <= 1:
            tmp_cumul["lock"] = True
        tmp_cumul["frag_start"].append(frag_start)
        tmp_cumul["frag_end"].append(frag_end)
        tmp_cumul["gc_content"].append(gc_content)
        tmp_cumul["init_id_frags"].append(init_id_frag_abs)
        old_2_new_frags[init_id_frag_abs] = new_id_frag_abs
        # An unflagged fragment closes the accumulated group: emit one
        # merged record and reset the accumulator for the next group.
        if not lock:
            for ele in tmp_cumul["list_chrom"]:
                if not (ele == tmp_cumul["list_chrom"][0]):
                    logger.info(
                        "warning problem hetero fragments!!!!!!!!!!!!!!!"
                    )
            contig_info_dict[chrom]["n_new_frags"] += 1
            contig_info_dict[chrom]["length_kb"] += tmp_cumul["size"]
            # str_frag_start = str(min(tmp_cumul["frag_start"]))
            # str_frag_end = str(max(tmp_cumul["frag_end"]))
            str_frag_start = str(new_id_frag_rel)
            str_frag_end = str(new_id_frag_rel)
            spec_new_frags[new_id_frag_abs] = {
                "accu_frag": accu_frag,
                "gc_content": tmp_cumul["gc_content"],
                "chrom": chrom,
            }
            handle_new_fragments_list.write(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                % (
                    str(new_id_frag_rel),
                    chrom,
                    tmp_cumul["start_pos"],
                    tmp_cumul["end_pos"],
                    str(tmp_cumul["size"]),
                    str(np.array(tmp_cumul["gc_content"]).mean()),
                    tmp_cumul["accu_frag"],
                    str_frag_start,
                    str_frag_end,
                )
            )
            # NOTE(review): start_pos is seeded with end_pos (and end_pos
            # with start_pos) for the next group — looks intentional for
            # contiguous coordinates but worth confirming.
            tmp_cumul["start_pos"] = end_pos
            tmp_cumul["end_pos"] = start_pos
            tmp_cumul["size"] = 0
            tmp_cumul["accu_frag"] = 0
            tmp_cumul["lock"] = name_frag in prob_frag
            tmp_cumul["chrom"] = chrom
            tmp_cumul["gc_content"] = []
            tmp_cumul["init_id_frags"] = []
            tmp_cumul["list_chrom"] = []
            tmp_cumul["frag_start"] = []
            tmp_cumul["frag_end"] = []
            new_id_frag_rel += 1
            new_id_frag_abs += 1
    logger.debug("max new id = {}".format(new_id_frag_abs))
    # Rewrite the contig list with post-filtering fragment counts; contigs
    # left with zero fragments are dropped entirely.
    handle_new_contigs_list = open(new_contig_list_file, "w")
    handle_new_contigs_list.write(
        "%s\t%s\t%s\t%s\n"
        % ("contig", "length_kb", "n_frags", "cumul_length")
    )
    handle_contig_info = open(contig_info, "r")
    handle_contig_info.readline()
    cumul_length = 0
    while 1:
        line_contig = handle_contig_info.readline()
        if not line_contig:
            handle_contig_info.close()
            handle_new_contigs_list.close()
            break
        data = line_contig.split("\t")
        contig = data[0]
        # length_kb = data[1]
        length_kb = contig_info_dict[contig]["length_kb"]
        n_frags = contig_info_dict[contig]["n_new_frags"]
        if n_frags > 0:
            handle_new_contigs_list.write(
                "%s\t%s\t%s\t%s\n"
                % (contig, str(length_kb), str(n_frags), str(cumul_length))
            )
            cumul_length += n_frags
        else:
            logger.info(contig + " has been deleted...")
    logger.info("update contacts files...")
    # write new contacts file
    # n_total_contacts = file_len(abs_fragments_contacts)
    handle_new_abs_fragments_contacts = open(
        new_abs_fragments_contacts_file, "w"
    )
    handle_abs_fragments_contacts = open(abs_fragments_contacts, "r")
    handle_new_abs_fragments_contacts.write(
        "%s\t%s\t%s\n" % ("id_frag_a", "id_frag_b", "n_contact")
    )
    all_lines = handle_abs_fragments_contacts.readlines()
    # (A legacy commented-out rewrite loop from the GRAAL-era weighted
    # format used to live here; it duplicated the aggregation below with
    # extra per-pair weight columns and was never executed.)
    # Remap every contact onto the surviving fragment ids; contacts that
    # touch a destroyed fragment are dropped.
    sparse_dict = dict()
    for id_line in range(1, len(all_lines)):
        line = all_lines[id_line]
        dat = line.split("\t")
        fa = int(dat[0])
        fb = int(dat[1])
        nc = int(dat[2])
        new_abs_id_frag_a = old_2_new_frags[
            fa + 1
        ]  # old_2_new_frags 1-based index
        new_abs_id_frag_b = old_2_new_frags[
            fb + 1
        ]  # old_2_new_frags 1-based index
        if not (
            new_abs_id_frag_a == "destroyed"
            or new_abs_id_frag_b == "destroyed"
        ):
            mates = [new_abs_id_frag_a - 1, new_abs_id_frag_b - 1]
            mates.sort()
            f1 = mates[0]
            f2 = mates[1]
            if f1 in sparse_dict:
                if f2 in sparse_dict[f1]:
                    sparse_dict[f1][f2] += nc
                else:
                    sparse_dict[f1][f2] = nc
            else:
                sparse_dict[f1] = dict()
                sparse_dict[f1][f2] = nc
    keys = list(sparse_dict.keys())
    keys.sort()
    for fa in keys:
        d_fb = sparse_dict[fa]
        keys_b = list(d_fb.keys())
        keys_b.sort()
        for fb in keys_b:
            nc = d_fb[fb]
            handle_new_abs_fragments_contacts.write(
                "%s\t%s\t%s\n" % (str(fa), str(fb), str(nc))
            )
    # NOTE(review): handle_abs_fragments_contacts is never closed.
    handle_new_abs_fragments_contacts.close()
    return thresh
# NOTE: end of the section the original author flagged with a "PROBLEM"
# banner (see the matching note above remove_problematic_fragments).
def get_contig_info_from_file(contig_info):
    """Parse a contig-info file into a lookup dict and an ordered name list.

    Parameters
    ----------
    contig_info : str, file or pathlib.Path
        Tab-separated file with a header line, then one line per contig:
        name, length_kb, n_frags, cumul_length.

    Returns
    -------
    tuple of (dict, list)
        Mapping of contig name to its ``n_frags``/``length_kb``/
        ``cumul_length`` values, and the contig names in file order.
    """
    init_contigs = dict()
    list_contigs = []
    with open(contig_info, "r") as handle:
        handle.readline()  # discard the header line
        for raw_line in handle:
            fields = raw_line.split("\t")
            name = fields[0]
            list_contigs.append(name)
            init_contigs[name] = {
                "n_frags": int(fields[2]),
                "length_kb": int(fields[1]),
                "cumul_length": int(fields[3]),
            }
    return init_contigs, list_contigs
def get_frag_info_from_fil(fragments_list):
    """Parse a tab-separated fragments list file.

    The file's first line is a header; each subsequent line holds at least:
    fragment index, contig name, start position, end position.

    Parameters
    ----------
    fragments_list : str
        Path to the fragments list file.

    Returns
    -------
    dict
        Maps a 0-based line number to a dict with keys ``"init_contig"``
        (contig name), ``"index"`` (the raw index string from column 0) and
        ``"length"`` (end - start).
    """
    fragments_info = dict()
    # Context manager instead of manual open/close; enumerate instead of a
    # hand-rolled counter shadowing the builtin `id`.
    with open(fragments_list, "r") as handle_list_fragments:
        handle_list_fragments.readline()  # skip header line
        for frag_id, line_contig in enumerate(handle_list_fragments):
            if not line_contig.strip():
                # Tolerate blank trailing lines instead of crashing.
                continue
            data = line_contig.split("\t")
            start = int(data[2])
            end = int(data[3])
            fragments_info[frag_id] = {
                "init_contig": data[1],
                "index": data[0],
                "length": end - start,
            }
    return fragments_info
def new_remove_problematic_fragments(
    contig_info,
    fragments_list,
    abs_fragments_contacts,
    new_contig_list_file,
    new_fragments_list_file,
    new_abs_fragments_contacts_file,
    pyramid,
):
    """Remove low-coverage ("problematic") fragments from a level-0 pyramid.

    Per-fragment coverage is derived from the symmetrised sparse contact
    matrix; fragments whose coverage falls below a threshold are discarded
    and the three text files describing the level (fragments list, contig
    info, contacts) are rewritten without them.

    NOTE(review): ``thresh`` is assigned three times in a row, so the
    computed mean/std threshold is overwritten by the hard-coded 0.0009
    ("de novo chr14") value -- confirm this is intended.
    NOTE(review): ``plt.show()`` blocks until the histogram window is
    closed, which makes this function interactive.

    Parameters
    ----------
    contig_info : str
        Path to the level-0 contig info file (input).
    fragments_list : str
        Path to the level-0 fragments list file (input).
    abs_fragments_contacts : str
        Path to the level-0 contacts file (input).
    new_contig_list_file : str
        Path of the filtered contig info file to write.
    new_fragments_list_file : str
        Path of the filtered fragments list file to write.
    new_abs_fragments_contacts_file : str
        Path of the filtered contacts file to write.
    pyramid : h5py.File or mapping
        Pyramid data; only level ``"0"`` (keys ``"data"`` and ``"nfrags"``)
        is read.
    """
    import numpy as np

    # full_resolution = pyramid["0"]
    level = pyramid["0"]
    np_2_scipy_sparse = level["data"]
    nfrags = level["nfrags"][0]
    logger.debug("nfrags = {}".format(nfrags))
    # Rebuild the upper-triangular sparse matrix, then symmetrise it so that
    # row lengths (indptr diffs) count every contact a fragment is part of.
    sparse_mat_csr = sp.csr_matrix(
        (np_2_scipy_sparse[2, :], np_2_scipy_sparse[0:2, :]),
        shape=(nfrags, nfrags),
    )
    full_mat = sparse_mat_csr + sparse_mat_csr.transpose()
    # Fraction of fragments each fragment has at least one contact with.
    collect_sparsity = np.float32(np.diff(full_mat.indptr)) / np.float32(
        nfrags
    )
    mean_spars = collect_sparsity.mean()
    std_spars = collect_sparsity.std()
    max_spars = collect_sparsity.max()
    logger.debug("n init frags = {}".format(nfrags))
    logger.debug("mean sparsity = {}".format(mean_spars))
    logger.debug("std sparsity = {}".format(std_spars))
    logger.debug("max_sparsity = {}".format(max_spars))
    # NOTE(review): only the LAST assignment is effective (see docstring).
    thresh = mean_spars - 1.01 * std_spars  # para g1
    thresh = mean_spars * 0.8  # for de novo chr14
    thresh = 0.0009  # for de novo chr14
    logger.debug("thresh sparsity = {}".format(thresh))
    plt.hist(collect_sparsity, 500)
    plt.show()
    list_fragments_problem = np.nonzero(collect_sparsity <= thresh)[0]
    logger.info(
        "number of fragments to remove = {}".format(
            len(list_fragments_problem)
        )
    )
    handle_new_fragments_list = open(new_fragments_list_file, "w")
    handle_new_fragments_list.write(
        "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
        % (
            "id",
            "chrom",
            "start_pos",
            "end_pos",
            "size",
            "gc_content",
            "accu_frag",
            "frag_start",
            "frag_end",
        )
    )
    # build np_id_2_frag dictionary
    np_id_2_frag = get_frag_info_from_fil(fragments_list)
    # build Contigs.init_contigs dictionary ( need to know the number of frags
    # per contig)
    init_contigs, list_init_contig = get_contig_info_from_file(contig_info)
    # Index the to-be-removed fragments by "<index>-<contig>" tags so they
    # can be recognised while re-reading the fragments list below.
    prob_frag = dict()
    for np_index in list_fragments_problem:
        tmp_frag = np_id_2_frag[np_index]
        id = tmp_frag["index"] + "-" + tmp_frag["init_contig"]
        prob_frag[id] = {
            "init_contig": tmp_frag["init_contig"],
            "index": tmp_frag["index"],
        }
    # print prob_frag
    contig_info_dict = dict()
    # list_init_contig = init_contigs.keys()
    # list_init_contig.sort()
    for chrom in list_init_contig:
        contig_info_dict[chrom] = {
            "n_frags": init_contigs[chrom]["n_frags"],
            "n_new_frags": 0,
            "length_kb": 0,
        }
    new_id_frag_rel = 0
    new_id_frag_abs = 1
    init_id_frag_abs = 0
    # Maps old absolute fragment id (1-based) to its new id, or "destroyed".
    old_2_new_frags = dict()
    handle_fragments_list = open(fragments_list, "r")
    handle_fragments_list.readline()
    spec_new_frags = dict()
    # Accumulator merging consecutive locked fragments into the next kept one.
    # NOTE(review): "frag_start"/"frag_end" keys only appear once the first
    # id == 1 line is seen -- assumes the first data line has id 1.
    tmp_cumul = {
        "start_pos": 0,
        "end_pos": 0,
        "chrom": 0,
        "size": 0,
        "accu_frag": 0,
        "gc_content": [],
        "lock": False,
        "init_id_frags": [],
        "list_chrom": [],
    }
    step = 0
    while 1:
        line_fragment = handle_fragments_list.readline()
        if not line_fragment:
            # End of file: if the trailing group was locked (never flushed),
            # mark its fragments destroyed and roll back the id counter.
            if tmp_cumul["lock"]:
                for ele in tmp_cumul["init_id_frags"]:
                    old_2_new_frags[ele] = "destroyed"
                new_id_frag_abs -= 1
            handle_fragments_list.close()
            handle_new_fragments_list.close()
            break
        step += 1
        init_id_frag_abs += 1
        data = line_fragment.split("\t")
        id = int(data[0])
        if id == 1:
            # First fragment of a new contig: reset the accumulator; any
            # pending locked fragments from the previous contig are dropped.
            new_id_frag_rel = 1
            if not tmp_cumul["lock"]:
                new_id_frag_abs += 0
            else:
                for ele in tmp_cumul["init_id_frags"]:
                    old_2_new_frags[ele] = "destroyed"
            tmp_cumul["gc_content"] = []
            tmp_cumul["start_pos"] = 0
            tmp_cumul["init_id_frags"] = []
            tmp_cumul["list_chrom"] = []
            tmp_cumul["frag_start"] = []
            tmp_cumul["frag_end"] = []
            tmp_cumul["size"] = 0
            # debug
        chrom = data[1]
        start_pos = data[2]
        end_pos = data[3]
        size = int(data[4])
        gc_content = float(data[5])
        accu_frag = int(data[6])
        frag_start = int(data[7])
        frag_end = int(data[8])
        name_frag = str(id) + "-" + chrom
        lock = name_frag in prob_frag
        tmp_cumul["chrom"] = chrom
        tmp_cumul["list_chrom"].append(chrom)
        tmp_cumul["end_pos"] = end_pos
        tmp_cumul["size"] += size
        tmp_cumul["accu_frag"] += accu_frag
        tmp_cumul["lock"] = name_frag in prob_frag
        if size <= 1:
            # Degenerate fragments are always discarded.
            tmp_cumul["lock"] = True
        tmp_cumul["frag_start"].append(frag_start)
        tmp_cumul["frag_end"].append(frag_end)
        tmp_cumul["gc_content"].append(gc_content)
        tmp_cumul["init_id_frags"].append(init_id_frag_abs)
        old_2_new_frags[init_id_frag_abs] = new_id_frag_abs
        if not lock:
            # Flush the accumulated group as one new fragment.
            for ele in tmp_cumul["list_chrom"]:
                if not (ele == tmp_cumul["list_chrom"][0]):
                    logger.info(
                        "warning problem hetero fragments!!!!!!!!!!!!!!!"
                    )
            contig_info_dict[chrom]["n_new_frags"] += 1
            contig_info_dict[chrom]["length_kb"] += tmp_cumul["size"]
            # str_frag_start = str(min(tmp_cumul["frag_start"]))
            # str_frag_end = str(max(tmp_cumul["frag_end"]))
            str_frag_start = str(new_id_frag_rel)
            str_frag_end = str(new_id_frag_rel)
            spec_new_frags[new_id_frag_abs] = {
                "accu_frag": accu_frag,
                "gc_content": tmp_cumul["gc_content"],
                "chrom": chrom,
            }
            handle_new_fragments_list.write(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                % (
                    str(new_id_frag_rel),
                    chrom,
                    tmp_cumul["start_pos"],
                    tmp_cumul["end_pos"],
                    str(tmp_cumul["size"]),
                    str(np.array(tmp_cumul["gc_content"]).mean()),
                    tmp_cumul["accu_frag"],
                    str_frag_start,
                    str_frag_end,
                )
            )
            # Reset the accumulator for the next group.
            # NOTE(review): "end_pos" is reset to start_pos here (looks
            # swapped) -- confirm against the original intent.
            tmp_cumul["start_pos"] = end_pos
            tmp_cumul["end_pos"] = start_pos
            tmp_cumul["size"] = 0
            tmp_cumul["accu_frag"] = 0
            tmp_cumul["lock"] = name_frag in prob_frag
            tmp_cumul["chrom"] = chrom
            tmp_cumul["gc_content"] = []
            tmp_cumul["init_id_frags"] = []
            tmp_cumul["list_chrom"] = []
            tmp_cumul["frag_start"] = []
            tmp_cumul["frag_end"] = []
            new_id_frag_rel += 1
            new_id_frag_abs += 1
    # Update contig info file
    logger.debug("max new id = {}".format(new_id_frag_abs))
    handle_new_contigs_list = open(new_contig_list_file, "w")
    handle_new_contigs_list.write(
        "%s\t%s\t%s\t%s\n" % ("contig", "length_kb", "n_frags", "cumul_length")
    )
    handle_contig_info = open(contig_info, "r")
    handle_contig_info.readline()
    cumul_length = 0
    while 1:
        line_contig = handle_contig_info.readline()
        if not line_contig:
            handle_contig_info.close()
            handle_new_contigs_list.close()
            break
        data = line_contig.split("\t")
        contig = data[0]
        # length_kb = data[1]
        length_kb = contig_info_dict[contig]["length_kb"]
        n_frags = contig_info_dict[contig]["n_new_frags"]
        if n_frags > 0:
            handle_new_contigs_list.write(
                "%s\t%s\t%s\t%s\n"
                % (contig, str(length_kb), str(n_frags), str(cumul_length))
            )
            cumul_length += n_frags
        else:
            # Every fragment of this contig was filtered out.
            logger.info(contig + " has been deleted...")
    logger.info("update contacts files...")
    handle_new_abs_fragments_contacts = open(
        new_abs_fragments_contacts_file, "w"
    )
    handle_abs_fragments_contacts = open(abs_fragments_contacts, "r")
    handle_new_abs_fragments_contacts.write(
        "%s\t%s\t%s\n" % ("id_frag_a", "id_frag_b", "n_contact")
    )
    all_lines = handle_abs_fragments_contacts.readlines()
    # Re-map every contact to the new (0-based) ids, accumulating counts for
    # contacts that now map to the same fragment pair.
    sparse_dict = dict()
    for id_line in range(1, len(all_lines)):
        line = all_lines[id_line]
        dat = line.split("\t")
        fa = int(dat[0])
        fb = int(dat[1])
        nc = int(dat[2])
        new_abs_id_frag_a = old_2_new_frags[
            fa + 1
        ]  # old_2_new_frags 1-based index
        new_abs_id_frag_b = old_2_new_frags[
            fb + 1
        ]  # old_2_new_frags 1-based index
        if not (
            new_abs_id_frag_a == "destroyed"
            or new_abs_id_frag_b == "destroyed"
        ):
            mates = [new_abs_id_frag_a - 1, new_abs_id_frag_b - 1]
            mates.sort()
            f1 = mates[0]
            f2 = mates[1]
            if f1 in sparse_dict:
                if f2 in sparse_dict[f1]:
                    sparse_dict[f1][f2] += nc
                else:
                    sparse_dict[f1][f2] = nc
            else:
                sparse_dict[f1] = dict()
                sparse_dict[f1][f2] = nc
    # Write contacts sorted by (id_frag_a, id_frag_b).
    keys = list(sparse_dict.keys())
    keys.sort()
    for fa in keys:
        d_fb = sparse_dict[fa]
        keys_b = list(d_fb.keys())
        keys_b.sort()
        for fb in keys_b:
            nc = d_fb[fb]
            handle_new_abs_fragments_contacts.write(
                "%s\t%s\t%s\n" % (str(fa), str(fb), str(nc))
            )
    handle_new_abs_fragments_contacts.close()
class pyramid:
    """In-memory view of an on-disk contact-map pyramid.

    Opens ``pyramid.hdf5`` in *pyramid_folder* and, for each of the
    *n_levels* resolution levels, parses the per-level fragments list and
    contig info text files into dictionaries stored in ``self.spec_level``.
    """

    def __init__(self, pyramid_folder, n_levels):
        # pyramid_folder: folder containing pyramid.hdf5 and level_<i> dirs.
        # n_levels: number of resolution levels in the pyramid.
        logger.info("init pyramid")
        self.pyramid_folder = pyramid_folder
        self.n_levels = n_levels
        pyramid_file = "pyramid.hdf5"
        self.pyramid_file = os.path.join(pyramid_folder, pyramid_file)
        # NOTE(review): h5py.File without an explicit mode -- modern h5py
        # defaults to read-only "r"; confirm no level writes through this.
        self.data = h5py.File(self.pyramid_file)
        self.spec_level = dict()
        # self.default_level = default_level
        self.struct_initiated = False
        self.resol_F_s_kb = 3  # size bin in kb
        self.dist_max_kb = 30 * 2 * self.resol_F_s_kb  # length histo in kb
        # self.resol_F_s_kb = 10  # size bin in kb
        # self.dist_max_kb = 10 * 2 * self.resol_F_s_kb  # length histo in kb
        for i in range(0, n_levels):
            level_folder = os.path.join(pyramid_folder, "level_" + str(i))
            # Every level but the coarsest carries a sub->super index map.
            find_super_index = i < n_levels - 1
            self.spec_level[str(i)] = dict()
            self.spec_level[str(i)]["level_folder"] = level_folder
            self.spec_level[str(i)]["fragments_list_file"] = os.path.join(
                level_folder, str(i) + "_fragments_list.txt"
            )
            self.spec_level[str(i)]["contig_info_file"] = os.path.join(
                level_folder, str(i) + "_contig_info.txt"
            )
            (
                frag_dictionary,
                contig_dictionary,
                list_contigs,
                list_contigs_id,
            ) = self.build_frag_dictionnary(
                self.spec_level[str(i)]["fragments_list_file"], i
            )
            if i == 0:
                # Contig naming is taken from the finest level.
                self.list_contigs_name = list_contigs
                self.list_contigs_id = list_contigs_id
            self.spec_level[str(i)]["fragments_dict"] = frag_dictionary
            self.spec_level[str(i)]["contigs_dict"] = contig_dictionary
            if find_super_index:
                # print "update super index"
                super_index_file = os.path.join(
                    level_folder, str(i) + "_sub_2_super_index_frag.txt"
                )
                self.update_super_index(
                    self.spec_level[str(i)]["fragments_dict"], super_index_file
                )
                self.update_super_index_in_dict_contig(
                    self.spec_level[str(i)]["fragments_dict"],
                    self.spec_level[str(i)]["contigs_dict"],
                )
            else:
                # Coarsest level: keep only integer contig-id keys, mirroring
                # what update_super_index_in_dict_contig does for the others.
                for contig_id in list(
                    self.spec_level[str(i)]["contigs_dict"].keys()
                ):
                    try:
                        int(contig_id)
                    except ValueError:
                        self.spec_level[str(i)]["contigs_dict"].pop(contig_id)
    def close(self):
        # Release the underlying hdf5 file handle opened in __init__.
        self.data.close()
[docs] def get_level(self, level_id): lev = level(self, level_id) return lev
    def build_frag_dictionnary(self, fragments_list, level):
        """Parse a level's fragments list file.

        Returns a 4-tuple:
        ``fragments_info`` -- dict keyed by 1-based absolute fragment id with
        per-fragment fields (positions, sub/super indices, ...);
        ``contig_dict`` -- dict keyed both by contig name (with ``"frag"`` /
        ``"id_contig"`` entries) and by integer contig id (list of
        ``B_frag`` objects);
        ``list_contigs`` -- contig names in order of first appearance;
        ``list_contigs_id`` -- matching 1-based integer contig ids.
        """
        handle_list_fragments = open(fragments_list, "r")
        handle_list_fragments.readline()  # skip header line
        fragments_info = dict()
        id = 1  # 1-based absolute fragment id across all contigs
        contig_dict = dict()
        id_contig = 0
        list_contigs = []
        list_contigs_id = []
        while 1:
            line_contig = handle_list_fragments.readline()
            if not line_contig:
                handle_list_fragments.close()
                break
            data = line_contig.split("\t")
            curr_id = int(data[0])  # 1-based id within its contig
            tag = data[0] + "-" + data[1]
            start_pos = int(data[2])
            end_pos = int(data[3])
            size = int(data[4])
            gc_content = float(data[5])
            n_accu_frags = int(data[6])
            id_init_frag_start = int(data[7])
            id_init_frag_end = int(data[8])
            if level > 0:
                # Coarser levels carry the span of finer-level fragments.
                id_sub_frag_start = int(data[9])
                id_sub_frag_end = int(data[10])
            else:
                # Level 0: each fragment is its own sub-fragment.
                id_sub_frag_start = int(curr_id)
                id_sub_frag_end = int(curr_id)
            fragments_info[id] = dict()
            contig_name = data[1]
            fragments_info[id]["init_contig"] = contig_name
            fragments_info[id]["index"] = int(curr_id)
            fragments_info[id]["tag"] = tag
            fragments_info[id]["start_pos(bp)"] = start_pos
            fragments_info[id]["end_pos(bp)"] = end_pos
            fragments_info[id]["size(bp)"] = size
            fragments_info[id]["sub_low_index"] = id_sub_frag_start
            fragments_info[id]["sub_high_index"] = id_sub_frag_end
            # Placeholder; overwritten later by update_super_index for all
            # levels but the coarsest.
            fragments_info[id]["super_index"] = curr_id
            fragments_info[id]["n_accu_frags"] = n_accu_frags
            if not (contig_name in contig_dict):
                id_contig += 1
                list_contigs.append(contig_name)
                list_contigs_id.append(id_contig)
                contig_dict[contig_name] = dict()
                contig_dict[contig_name]["frag"] = []
                contig_dict[contig_name]["id_contig"] = id_contig
                contig_dict[id_contig] = []
            f = B_frag.initiate(
                id,
                curr_id,
                contig_name,
                curr_id,
                start_pos,
                end_pos,
                size,
                gc_content,
                id_init_frag_start,
                id_init_frag_end,
                id_sub_frag_start,
                id_sub_frag_end,
                curr_id,
                id_contig,
                n_accu_frags,
            )
            contig_dict[contig_name]["frag"].append(f)
            contig_dict[id_contig].append(f)
            id += 1
        return fragments_info, contig_dict, list_contigs, list_contigs_id
[docs] def update_super_index(self, dict_frag, super_index_file): handle_super_index = open(super_index_file, "r") handle_super_index.readline() while 1: line_index = handle_super_index.readline() if not line_index: handle_super_index.close() break data = line_index.split("\t") dict_frag[int(data[0])]["super_index"] = int(data[1])
    def update_super_index_in_dict_contig(self, dict_frag, dict_contig):
        """Propagate per-fragment ``super_index`` values onto the B_frag
        objects stored in *dict_contig*, then drop the name-keyed entries so
        only integer contig-id keys remain.
        """
        set_contig = set()
        for id in list(dict_frag.keys()):
            id_frag = id
            frag = dict_frag[id_frag]
            init_contig = dict_frag[id_frag]["init_contig"]
            set_contig.add(init_contig)
            id_contig = dict_contig[init_contig]["id_contig"]
            # "index" is 1-based within the contig; the list is 0-based.
            f = dict_contig[id_contig][frag["index"] - 1]
            f.super_index = frag["super_index"]
        # print "set conrigs!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # print set_contig
        for ele in set_contig:
            dict_contig.pop(ele)
[docs] def zoom_in_frag(self, curr_frag): """ :param curr_frag: """ level = curr_frag[1] frag = curr_frag[0] output = [] if level > 0: str_level = str(level) sub_low = self.spec_level[str_level]["fragments_dict"][frag][ "sub_low_index" ] sub_high = self.spec_level[str_level]["fragments_dict"][frag][ "sub_high_index" ] new_level = level - 1 for i in range(sub_low, sub_high + 1): output.append((i, new_level)) else: output.append(curr_frag) return output
[docs] def zoom_out_frag(self, curr_frag): """ :param curr_frag: """ level = curr_frag[1] frag = curr_frag[0] output = [] if level > 0: str_level = str(level) high_frag = self.spec_level[str_level]["fragments_dict"][frag][ "super_index" ] new_level = level + 1 output = (high_frag, new_level) else: output = curr_frag return output
[docs] def full_zoom_in_frag(self, curr_frag): """ :param curr_frag: """ level = curr_frag[1] frag = curr_frag[0] output = [] if level > 0: str_level = str(level) sub_low = self.spec_level[str_level]["fragments_dict"][frag][ "sub_low_index" ] sub_high = self.spec_level[str_level]["fragments_dict"][frag][ "sub_high_index" ] new_level = level - 1 for i in range(sub_low, sub_high + 1): output.append((i, new_level)) else: output.append(curr_frag) return output
[docs] def zoom_in_pixel(self, curr_pixel): """ return the curr_frag at a higher resolution""" low_frag = curr_pixel[0] high_frag = curr_pixel[1] level = curr_pixel[2] if level > 0: str_level = str(level) low_sub_low = self.spec_level[str_level]["fragments_dict"][ low_frag ]["sub_low_index"] low_sub_high = self.spec_level[str_level]["fragments_dict"][ low_frag ]["sub_high_index"] high_sub_low = self.spec_level[str_level]["fragments_dict"][ high_frag ]["sub_low_index"] high_sub_high = self.spec_level[str_level]["fragments_dict"][ high_frag ]["sub_high_index"] vect = [low_sub_low, low_sub_high, high_sub_low, high_sub_high] new_pix_low = min(vect) new_pix_high = max(vect) new_level = level - 1 new_pixel = [new_pix_low, new_pix_high, new_level] else: new_pixel = curr_pixel return new_pixel
[docs] def zoom_out_pixel(self, curr_pixel): """ return the curr_frag at a lower resolution""" low_frag = curr_pixel[0] high_frag = curr_pixel[1] level = curr_pixel[2] str_level = str(level) if level < self.n_level - 1: low_super = self.spec_level[str_level]["fragments_dict"][low_frag][ "super_index" ] high_super = self.spec_level[str_level]["fragments_dict"][ high_frag ]["sub_index"] new_pix_low = min([low_super, high_super]) new_pix_high = max([low_super, high_super]) new_level = level + 1 new_pixel = [new_pix_low, new_pix_high, new_level] else: new_pixel = curr_pixel return new_pixel
[docs] def zoom_in_area(self, area): """ zoom in area""" x = area[0] y = area[1] level = x[2] logger.debug("x = {}".format(x)) logger.debug("y = {}".format(y)) logger.debug("level = {}".format(level)) if level == y[2] and level > 0: new_level = level - 1 high_x = self.zoom_in_pixel(x) high_y = self.zoom_in_pixel(y) new_x = [ min([high_x[0], high_y[0]]), min([high_x[1], high_y[1]]), new_level, ] new_y = [ max([high_x[0], high_y[0]]), max([high_x[1], high_y[1]]), new_level, ] new_area = [new_x, new_y] else: new_area = area return new_area
    def load_reference_sequence(self, genome_fasta):
        """Load a FASTA reference genome into
        ``self.dict_sequence_contigs`` (contig name -> ungapped sequence
        string).
        """
        logger.info("import reference genome")
        f = open(genome_fasta, "r")
        self.dict_sequence_contigs = dict()
        all_lines = f.readlines()
        # Header names are taken from ">" lines, stripping the last char
        # plus any newline/carriage-return remnants.
        id_chrom = all_lines[0][1:-1].replace("\n", "").replace("\r", "")
        self.dict_sequence_contigs[id_chrom] = ""
        start = 1
        chrom_list = []
        for i in range(1, len(all_lines)):
            if all_lines[i][0] == ">":
                # print id_chrom
                chrom_list.append(id_chrom)
                self.dict_sequence_contigs[id_chrom] = "".join(
                    all_lines[start:i]
                )
                start = i + 1
                id_chrom = (
                    all_lines[i][1:-1].replace("\n", "").replace("\r", "")
                )
                self.dict_sequence_contigs[id_chrom] = ""
        chrom_list.append(id_chrom)
        # NOTE(review): the last contig uses all_lines[start:-1], which
        # drops the file's final line -- assumes the file ends with a
        # blank/terminator line; confirm against real inputs.
        self.dict_sequence_contigs[id_chrom] = "".join(all_lines[start:-1])
        chrom_list = list(self.dict_sequence_contigs.keys())
        # chrom_list.sort()
        for chrom in chrom_list:
            # Strip intra-sequence line breaks so slicing by bp works.
            self.dict_sequence_contigs[chrom] = self.dict_sequence_contigs[
                chrom
            ].replace("\n", "")
            self.dict_sequence_contigs[chrom] = self.dict_sequence_contigs[
                chrom
            ].replace("\r", "")
        f.close()
# print chrom_list
class level:
    """One resolution level of a pyramid: sparse contact matrices plus
    per-fragment structure-of-arrays metadata and OpenGL display vectors.
    """

    def __init__(self, pyramid, level):
        # pyramid: the owning pyramid object; level: integer level index.
        self.level = level
        # Structured dtype mirroring the per-fragment record used on the
        # GPU side (aligned for interop).
        self.T_frag = np.dtype(
            [
                ("pos", np.int32),
                ("id_c", np.int32),
                ("start_bp", np.int32),
                ("len_bp", np.int32),
                ("circ", np.int32),
                ("id", np.int32),
                ("prev", np.int32),
                ("next", np.int32),
                ("l_cont", np.int32),
                ("l_cont_bp", np.int32),
                ("n_accu", np.int32),
            ],
            align=True,
        )
        # float4-equivalent dtype for OpenGL vertex data.
        self.float4 = np.dtype(
            [
                ("x", np.float32),
                ("y", np.float32),
                ("z", np.float32),
                ("w", np.float32),
            ],
            align=True,
        )
        # Structure-of-arrays fragment attributes, filled by load_data and
        # converted to int32 numpy arrays there.
        self.S_o_A_frags = {}
        self.S_o_A_frags["pos"] = []
        self.S_o_A_frags["sub_pos"] = []
        self.S_o_A_frags["id_c"] = []
        self.S_o_A_frags["start_bp"] = []
        self.S_o_A_frags["len_bp"] = []
        self.S_o_A_frags["sub_len"] = []
        self.S_o_A_frags["circ"] = []
        self.S_o_A_frags["id"] = []
        self.S_o_A_frags["prev"] = []
        self.S_o_A_frags["next"] = []
        self.S_o_A_frags["l_cont"] = []
        self.S_o_A_frags["sub_l_cont"] = []
        self.S_o_A_frags["l_cont_bp"] = []
        self.S_o_A_frags["n_accu"] = []
        self.vect_frag_np = []
        self.frags_init_contigs = []
        # self.pos_vect_frags_4_GL = []
        # self.col_vect_frags_4_GL = []
        self.load_data(pyramid)
        self.pyramid = pyramid
    def load_data(self, pyramid):
        """Load this level's sparse matrix and fragment metadata.

        Fills the sparse CSR/CSC matrices, the structure-of-arrays fragment
        attributes, the OpenGL position/colour vectors, ``dict_contigs``,
        and the mean trans (inter-contig) contact value.

        :param pyramid: hic pyramid
        """
        import colorsys

        logger.info("loading data from level = {}".format(self.level))
        # self.im_init = np.array(pyramid.data[str(self.level)],
        # dtype=np.int32)
        # self.n_frags = self.im_init.shape[0]
        # start loading sparse matrix #####
        self.n_frags = int(np.copy(pyramid.data[str(self.level)]["nfrags"][0]))
        self.np_2_scipy_sparse = np.copy(pyramid.data[str(self.level)]["data"])
        # Row 0/1 are (row, col) indices, row 2 the contact counts.
        self.sparse_mat_csr = sp.csr_matrix(
            (self.np_2_scipy_sparse[2, :], self.np_2_scipy_sparse[0:2, :]),
            shape=(self.n_frags, self.n_frags),
        )
        self.sparse_mat_csc = sp.csc_matrix(
            (self.np_2_scipy_sparse[2, :], self.np_2_scipy_sparse[0:2, :]),
            shape=(self.n_frags, self.n_frags),
        )
        # end loading sparse matrix #####
        int_n_frags = np.int32(self.n_frags)
        # Per-fragment (x, y, z, w) position and RGBA colour for OpenGL.
        self.pos_vect_frags_4_GL = np.ndarray(
            (int_n_frags, 4), dtype=np.float32
        )
        self.col_vect_frags_4_GL = np.ndarray(
            (int_n_frags, 4), dtype=np.float32
        )
        self.dict_contigs = dict()
        ContFrags = pyramid.spec_level[str(self.level)]["contigs_dict"]
        # Fragments of the next-finer level (fall back to this level's own
        # dict at level 0).
        if str(self.level - 1) in pyramid.spec_level:
            subContFrags = pyramid.spec_level[str(self.level - 1)][
                "contigs_dict"
            ]
        else:
            subContFrags = ContFrags
        coord_cont = dict()
        n_contigs = len(list(ContFrags.keys()))
        # print "n init contigs = ", n_contigs
        # One distinguishable hue per contig.
        HSV_tuples = [
            (x * 2.5 / n_contigs, 0.5, 0.5) for x in range(n_contigs)
        ]
        RGB_tuples = [colorsys.hsv_to_rgb(*x) for x in HSV_tuples]
        self.distri_frag = []
        # list_contigs = ContFrags.keys()
        # print "Cont frags keys = ", ContFrags.keys()
        list_contigs_id = pyramid.list_contigs_id  # debugggg
        # list_contigs.sort()
        for i in range(0, self.n_frags):
            self.frags_init_contigs.append("")
        for id_cont in list_contigs_id:
            self.dict_contigs[id_cont] = dict()
            cont_frag = ContFrags[id_cont]
            sub_cont_frag = subContFrags[id_cont]
            coord_cont[id_cont] = []
            name_contig = cont_frag[0].init_contig
            tick_kb = []
            end_frags_kb = []
            id_f_prev = -1
            lim_curr_cont = len(cont_frag)
            sub_lim_curr_cont = len(sub_cont_frag)
            len_curr_contig_bp = 0
            curr_sub_pos = 0
            # First pass: total contig length in bp.
            for id_f in range(0, lim_curr_cont):
                f = cont_frag[id_f]
                lkb = f.length_kb
                len_curr_contig_bp += lkb
            # Second pass: fill per-fragment records.
            for id_f in range(0, lim_curr_cont):
                f = cont_frag[id_f]
                lkb = f.length_kb
                self.distri_frag.append(lkb)
                start = f.start_pos
                end = f.end_pos
                pos_kb = start + lkb / 2.
                tick_kb.append(pos_kb)
                end_frags_kb.append(end)
                # np_id_abs is 1-based; arrays are 0-based.
                id_f_curr = f.np_id_abs - 1
                n_accu_frags = f.n_accu_frags
                n_sub_frags = f.sub_frag_end - f.sub_frag_start + 1
                self.frags_init_contigs[id_f_curr] = f.init_contig
                if id_f < lim_curr_cont - 1:
                    id_f_next = cont_frag[id_f + 1].np_id_abs - 1
                else:
                    id_f_next = -1  # sentinel: last fragment of the contig
                f_T_frag = (
                    np.int32(f.curr_id - 1),
                    np.int32(f.contig_id),
                    np.int32(start),
                    np.int32(lkb),
                    np.int32(0),
                    np.int32(id_f_curr),
                    np.int32(id_f_prev),
                    np.int32(id_f_next),
                    np.int32(lim_curr_cont),
                    np.int32(len_curr_contig_bp),
                    np.int32(n_accu_frags),
                )
                self.pos_vect_frags_4_GL[id_f_curr, 0] = (
                    np.float32(f.curr_id - 1) / 100.
                )
                self.pos_vect_frags_4_GL[id_f_curr, 1] = (
                    np.float32(f.contig_id) / 100.
                )
                # self.pos_vect_frags_4_GL[id_f_curr, 2] = np.float32(0.0)
                self.pos_vect_frags_4_GL[id_f_curr, 2] = 0.
                self.pos_vect_frags_4_GL[id_f_curr, 3] = np.float32(1.0)
                # GL_pos_frag = (np.float32(f.curr_id - 1),
                # np.float32(f.contig_id),
                # np.float32(lim_curr_cont),
                # np.float32(0.0))
                # self.col_vect_frags_4_GL[id_f_curr,0] = np.float32(0.)
                # self.col_vect_frags_4_GL[id_f_curr,1] = np.float32(1.)
                # self.col_vect_frags_4_GL[id_f_curr,2] = np.float32(0.)
                # self.col_vect_frags_4_GL[id_f_curr,3] = np.float32(1.)
                self.col_vect_frags_4_GL[id_f_curr, 0] = np.float32(
                    RGB_tuples[id_cont - 1][0]
                )
                self.col_vect_frags_4_GL[id_f_curr, 1] = np.float32(
                    RGB_tuples[id_cont - 1][1]
                )
                self.col_vect_frags_4_GL[id_f_curr, 2] = np.float32(
                    RGB_tuples[id_cont - 1][2]
                )
                self.col_vect_frags_4_GL[id_f_curr, 3] = np.float32(1.0)
                # GL_col_frag = (np.float32(RGB_tuples[id_cont - 1][0],
                # np.float32(RGB_tuples[id_cont - 1][1]),
                # np.float32(RGB_tuples[id_cont - 1][2]),
                # np.float32(1.0))
                self.S_o_A_frags["pos"].append(f.curr_id - 1)
                self.S_o_A_frags["sub_pos"].append(curr_sub_pos)  # TO DO !!!!
                self.S_o_A_frags["id_c"].append(f.contig_id)
                self.S_o_A_frags["start_bp"].append(start)
                self.S_o_A_frags["len_bp"].append(lkb)
                self.S_o_A_frags["sub_len"].append(n_sub_frags)
                self.S_o_A_frags["circ"].append(0)
                self.S_o_A_frags["id"].append(id_f_curr)
                self.S_o_A_frags["prev"].append(id_f_prev)
                self.S_o_A_frags["next"].append(id_f_next)
                self.S_o_A_frags["l_cont"].append(lim_curr_cont)
                self.S_o_A_frags["sub_l_cont"].append(sub_lim_curr_cont)
                self.S_o_A_frags["l_cont_bp"].append(len_curr_contig_bp)
                self.S_o_A_frags["n_accu"].append(n_accu_frags)
                id_f_prev = id_f_curr
                self.vect_frag_np.append(f_T_frag)
                curr_sub_pos += n_sub_frags
                # self.pos_vect_frags_4_GL.append(GL_pos_frag)
                # self.col_vect_frags_4_GL.append(GL_col_frag)
            for f in cont_frag:
                coord_cont[id_cont].append(f.np_id_abs - 1)
            self.dict_contigs[id_cont]["intra_coord"] = coord_cont[id_cont]
            self.dict_contigs[id_cont]["frags"] = cont_frag
            self.dict_contigs[id_cont]["name"] = name_contig
            self.dict_contigs[id_cont]["tick_kb"] = np.array(tick_kb)
            self.dict_contigs[id_cont]["end_frags_kb"] = np.array(
                end_frags_kb
            )
        #######################
        self.vect_frag_np = np.array(self.vect_frag_np, dtype=self.T_frag)
        # self.pos_vect_frags_4_GL = np.array(self.pos_vect_frags_4_GL,
        # dtype=self.float4) self.col_vect_frags_4_GL =
        # np.array(self.col_vect_frags_4_GL, dtype=self.float4)
        for key in list(self.S_o_A_frags.keys()):
            self.S_o_A_frags[key] = np.array(
                self.S_o_A_frags[key], dtype=np.int32
            )
        # Mean trans value: average contact count over all inter-contig
        # fragment pairs (upper triangle only).
        total_trans = 0
        n_tot = 0
        n_tot_intra = 0
        for id_cont in range(1, len(self.dict_contigs) + 1):
            sub_sp_mat_tmp = self.sparse_mat_csr[coord_cont[id_cont], :]
            n_full_contact = sub_sp_mat_tmp.sum()
            sub_sp_mat_tmp_csc = sub_sp_mat_tmp.tocsc()
            sub_sp_mat_tmp_intra = sub_sp_mat_tmp_csc[:, coord_cont[id_cont]]
            shape_test = sub_sp_mat_tmp_intra.shape
            test = shape_test[0] == shape_test[1]
            if not test:
                logger.info("MIERDA!!!!")
            n_intra = sub_sp_mat_tmp_intra.sum()
            total_trans += n_full_contact
            total_trans -= n_intra
            # n_tot += (sub_sp_mat_tmp.shape[0] * sub_sp_mat_tmp.shape[1]) -
            # sub_sp_mat_tmp_intra.shape[0]*sub_sp_mat_tmp_intra.shape[1]
            n_tot_intra += shape_test[0] * (shape_test[0] - 1) / 2
        # for id_cont in xrange(1, len(self.dict_contigs)+1):
        # full = self.im_init[coord_cont[id_cont], :]
        # intra = self.im_init[np.ix_(coord_cont[id_cont],
        # coord_cont[id_cont])]
        # total_trans += full.sum()
        # total_trans -= intra.sum()
        # n_tot += (full.shape[0]*full.shape[1]) -
        # intra.shape[0]*intra.shape[1]
        n_tot = (
            self.sparse_mat_csr.shape[0]
            * (self.sparse_mat_csr.shape[0] - 1)
            / 2
            - n_tot_intra
        )
        self.mean_value_trans = total_trans / np.float32(n_tot)
        if np.isnan(self.mean_value_trans):
            # Single-contig case: no trans pairs exist; fall back to a small
            # fraction of the minimum stored contact value.
            self.mean_value_trans = np.amin(self.sparse_mat_csr.data) / 10.0
        logger.debug(
            "computed mean trans value ... = {}".format(self.mean_value_trans)
        )
        self.distri_frag = np.array(self.distri_frag)
        # self.define_inter_chrom_coord()
        # self.init_data()
        self.n_contigs = len(self.dict_contigs)
    def init_data(self,):
        """Copy the dense matrix ``self.im_init`` into ``self.im_curr``.

        NOTE(review): ``self.im_init`` is only assigned by the dense-loading
        path that is commented out in :meth:`load_data`, so calling this
        method as-is raises ``AttributeError`` -- effectively dead code.
        """
        logger.info("init data {}".format(self.level))
        # Workaround for specific old numpy releases where np.copy of the
        # hdf5-backed array misbehaved; presumably obsolete -- TODO confirm.
        if np.__version__ == "1.7.1" or np.__version__ == "1.8.0.dev-1a9aa5a":
            tmp = np.empty_like(self.im_init)
            np.copyto(tmp, self.im_init)
        else:
            tmp = np.copy(self.im_init)
        tmp = np.float32(tmp)
        self.im_curr = tmp
    def define_inter_chrom_coord(self):
        """Precompute np.ix_ index grids for every contig-vs-contig block.

        NOTE(review): contig id 17 is hard-coded as excluded -- looks like a
        leftover from a specific dataset; confirm before reuse.
        """
        self.inter_coord = dict()
        self.all_data = dict()
        logger.info("define inter chrom coord ...")
        for id_cont_X in self.dict_contigs:
            if not (id_cont_X == 17):
                self.inter_coord[id_cont_X] = dict()
                self.all_data[id_cont_X] = dict()
                coord_intra_X = self.dict_contigs[id_cont_X]["intra_coord"]
                # Rows of contig X against all fragments.
                self.inter_coord[id_cont_X]["all"] = np.ix_(
                    coord_intra_X, np.arange(0, self.n_frags)
                )
                for id_cont_Y in self.dict_contigs:
                    if not (id_cont_Y == 17):
                        self.all_data[id_cont_X][id_cont_Y] = dict()
                        coord_intra_Y = self.dict_contigs[id_cont_Y][
                            "intra_coord"
                        ]
                        self.inter_coord[id_cont_X][id_cont_Y] = np.ix_(
                            coord_intra_X, coord_intra_Y
                        )
        logger.info("done!")
    def build_seq_per_bin(self, genome_fasta):
        """Extract, for every fragment of this level, its DNA sequence from
        the reference genome into ``self.list_seq`` (ordered by contig id,
        then fragment order).
        """
        self.pyramid.load_reference_sequence(genome_fasta)
        ContFrags = self.pyramid.spec_level[str(self.level)]["contigs_dict"]
        list_contigs = list(ContFrags.keys())
        list_contigs.sort()
        list_contigs_tmp = list(
            self.pyramid.spec_level[str(self.level)]["contigs_dict"].keys()
        )
        # print "list contigs = ", list_contigs
        # Keep only the integer contig-id keys (name keys are strings).
        list_contigs = []
        for ele in list_contigs_tmp:
            if not isinstance(ele, str):
                list_contigs.append(ele)
        list_contigs.sort()
        self.list_seq = []
        for cont in list_contigs:
            for frag in ContFrags[cont]:
                start = frag.start_pos
                end = frag.end_pos
                # print "init contig = ", frag.init_contig
                init_contig = frag.init_contig
                seq_frag = self.pyramid.dict_sequence_contigs[init_contig][
                    start:end
                ]
                # print "len seq per frag = ", len(seq_frag)
                self.list_seq.append(seq_frag)
    def generate_new_fasta(self, vect_frags, new_fasta, info_frags):
        """Write the scaffolded assembly as a FASTA file plus a companion
        info file describing which original fragment (and orientation) went
        where.

        Only contigs whose fragments are all active are written.

        :param vect_frags: fragment vector with ``id_c``, ``pos``, ``ori``,
            ``activ`` and ``id_d`` array attributes (one entry per fragment).
        :param new_fasta: path of the FASTA file to write.
        :param info_frags: path of the per-fragment info file to write.
        """
        logger.info("generate new fasta file...")
        id_c_frag = vect_frags.id_c
        pos_frag = vect_frags.pos
        ori_frag = vect_frags.ori
        activ_frag = vect_frags.activ
        handle_new_fasta = open(new_fasta, "w")
        handle_info_frags = open(info_frags, "w")
        list_id_contigs = np.unique(id_c_frag)
        # n_new_contigs = len(list_id_contigs)
        list_seq_new_contigs = dict()
        list_contigs_ok = []
        for id_cont in list_id_contigs:
            list_frags = np.nonzero(id_c_frag == id_cont)[0]
            if np.all(activ_frag[list_frags] == 1):
                # print "id contig = ", id_cont
                list_contigs_ok.append(id_cont)
                header = ">3C-assembly|contig_" + str(id_cont)
                handle_info_frags.write("%s\n" % (header))
                handle_info_frags.write(
                    "%s\t%s\t%s\t%s\t%s\n"
                    % ("init_contig", "id_frag", "orientation", "start", "end")
                )
                # Concatenate fragments in their new scaffold order.
                new_positions = pos_frag[list_frags]
                ordered_id_frags = list_frags[np.argsort(new_positions)]
                list_seq_new_contigs[id_cont] = ""
                for f in ordered_id_frags:
                    ori = ori_frag[f]
                    init_frag_id = vect_frags.id_d[f]
                    init_contig = self.frags_init_contigs[init_frag_id]
                    # fragments_dict is keyed 1-based.
                    start_bp = self.pyramid.spec_level[str(self.level)][
                        "fragments_dict"
                    ][init_frag_id + 1]["start_pos(bp)"]
                    end_bp = self.pyramid.spec_level[str(self.level)][
                        "fragments_dict"
                    ][init_frag_id + 1]["end_pos(bp)"]
                    extract_seq = self.pyramid.dict_sequence_contigs[
                        init_contig
                    ][start_bp:end_bp]
                    if ori == -1:
                        # Reverse-complement (lowercase bases are uppercased
                        # by the translation table).
                        seq = extract_seq[::-1]
                        seq = seq.translate(
                            str.maketrans("TAGCtagc", "ATCGATCG")
                        )
                    else:
                        seq = extract_seq
                    handle_info_frags.write(
                        "%s\t%s\t%s\t%s\t%s\n"
                        % (
                            init_contig,
                            str(init_frag_id),
                            str(ori),
                            str(start_bp),
                            str(end_bp),
                        )
                    )
                    list_seq_new_contigs[id_cont] += seq
        # Second pass: emit the FASTA records, wrapped at 61 characters.
        for id_cont in list_contigs_ok:
            cont_seq = list_seq_new_contigs[id_cont]
            header = ">3C-assembly|contig_" + str(id_cont)
            # print header
            handle_new_fasta.write("%s\n" % (header))
            len_line = 61
            len_seq = len(cont_seq)
            if len_seq > 0:
                idx_cut_EOL = list(range(0, len_seq, len_line))
                for id_s in range(1, len(idx_cut_EOL)):
                    line = cont_seq[idx_cut_EOL[id_s - 1] : idx_cut_EOL[id_s]]
                    handle_new_fasta.write("%s\n" % (line))
                # Flush the final partial chunk, if any.
                if idx_cut_EOL[-1] != len_seq - 1:
                    line = cont_seq[idx_cut_EOL[-1] :]
                    handle_new_fasta.write("%s\n" % (line))
        handle_new_fasta.close()
        handle_info_frags.close()
def main():
    """Command-line entry point.

    Usage: ``./pyramid_sparse.py in_file out_file`` -- converts an absolute
    contacts file into coo format via ``abs_contact_2_coo_file``.
    """
    try:
        (in_file, out_file) = sys.argv[1:]
    except ValueError:
        # Unpacking sys.argv[1:] into two names raises ValueError (not the
        # previously-caught IndexError) when the argument count is wrong,
        # so the usage message never printed before this fix.
        print("Usage: ./pyramid_sparse.py in_file out_file")
        sys.exit(1)  # explicit non-zero exit instead of the site-only quit()
    abs_contact_2_coo_file(in_file, out_file)
if __name__ == "__main__": main()