#!/usr/bin/env python
# coding: utf-8
"""Genetic map based validation
Aggregate genetic linkage data with an existing assembly for correction and
polishing purposes.
Usage:
linkage.py <linkage.csv> <info_frags.txt> [--output <output_file>]
[--fasta <reference.fa>]
Options:
-h, --help Display this help message.
--version Display the program's current version.
-o, --output Prefix of output files
-f, --fasta If a fasta reference is supplied, also write
the corresponding genome from info_frags.txt
"""
import functools
import itertools
import operator
import collections
import copy
import docopt
import numpy as np
from instagraal import parse_info_frags
parse_linkage_csv = functools.partial(
np.genfromtxt, dtype=None, skip_header=1, encoding="utf-8"
)
[docs]def collapse_degenerate_markers(linkage_records):
"""Group all markers with no genetic distance as distinct features
to generate a BED file with.
Simple example with sixteen degenerate markers:
>>> marker_features = [
... ['36915_sctg_207_31842', 1, 0, 207, 31842],
... ['36941_sctg_207_61615', 1, 0, 207, 61615],
... ['36956_sctg_207_77757', 1, 0, 207, 77757],
... ['36957_sctg_207_78332', 1, 0, 207, 78332],
... ['36972_sctg_207_94039', 1, 0, 207, 94039],
... ['36788_sctg_207_116303', 1, 0.652, 207, 116303],
... ['36812_sctg_207_158925', 1, 1.25, 207, 158925],
... ['36819_sctg_207_165424', 1, 1.25, 207, 165424],
... ['36828_sctg_207_190813', 1, 1.25, 207, 190813],
... ['36830_sctg_207_191645', 1, 1.25, 207, 191645],
... ['36834_sctg_207_195961', 1, 1.25, 207, 195961],
... ['36855_sctg_207_233632', 1, 1.25, 207, 233632],
... ['36881_sctg_207_258658', 1, 1.25, 207, 258658],
... ['82072_sctg_486_41893', 1, 3.756, 486, 41893],
... ['85634_sctg_516_36614', 1, 3.756, 516, 36614],
... ['85638_sctg_516_50582', 1, 3.756, 516, 50582]]
>>> len(marker_features)
16
>>> collapsed_features = collapse_degenerate_markers(marker_features)
>>> len(collapsed_features)
5
The degenerate features (identical linkage group, genetic distance and
original scaffold) are collapsed into a region:
>>> collapsed_features[0]
[1, 31842, 94039, 207]
The format is [linkage group, start, end, original scaffold].
If a singleton (non-degenerate) feature is found, the region is simply
a single point in the genome:
>>> collapsed_features[1]
[1, 116303, 116303, 207]
so 'start' and 'end' are identical.
Two markers are not considered degenerate if they belong to different
original scaffolds, even if they are in terms of genetic linkage:
>>> collapsed_features[2]
[1, 158925, 258658, 207]
>>> collapsed_features[3:]
[[1, 41893, 41893, 486], [1, 36614, 50582, 516]]
"""
def degeneracy(linkage_record):
linkage_group, genetic_distance, scaffold = (
linkage_record[1],
linkage_record[2],
linkage_record[3],
)
return (linkage_group, genetic_distance, scaffold)
degenerate_records = []
for _, degenerate_group in itertools.groupby(
linkage_records, key=degeneracy
):
group_list = list(degenerate_group)
start_record, end_record = group_list[0], group_list[-1]
assert (start_record[1], start_record[2], start_record[3]) == (
end_record[1],
end_record[2],
end_record[3],
)
start_position = start_record[-1]
end_position = end_record[-1]
scaffold = start_record[3]
linkage_group = start_record[1]
record = [linkage_group, start_position, end_position, scaffold]
degenerate_records.append(record)
return degenerate_records
[docs]def write_bed(records, output_file):
with open(output_file, "w") as bed_handle:
for record in records:
line = "\t".join((str(field) for field in record))
bed_handle.write("{}\n".format(line))
[docs]def linkage_group_ordering(linkage_records):
"""Convert degenerate linkage records into ordered info_frags-like records
for comparison purposes.
Simple example:
>>> linkage_records = [
... ['linkage_group_1', 31842, 94039, 'sctg_207'],
... ['linkage_group_1', 95303, 95303, 'sctg_207'],
... ['linkage_group_2', 15892, 25865, 'sctg_308'],
... ['linkage_group_2', 41893, 41893, 'sctg_486'],
... ['linkage_group_3', 36614, 50582, 'sctg_516'],
... ]
>>> ordering = linkage_group_ordering(linkage_records)
Each key of the record is a newly-formed 'scaffold' (linkage group):
>>> sorted(ordering.keys())
['linkage_group_1', 'linkage_group_2', 'linkage_group_3']
Records are in the form [init_contig, frag_id, start, end, orientation].
Since fragment ids are meaningless in non-HiC contexts a negative
identifier is set so it is understood that region was added due to
linkage data (-1 is for recovering data after first-pass polishing and -2
is for sequence insertions after long read based polishing).
>>> ordering['linkage_group_1']
[['sctg_207', -3, 31842, 94039, 1], ['sctg_207', -3, 95303, 95303, 1]]
>>> ordering['linkage_group_2']
[['sctg_308', -3, 15892, 25865, 1], ['sctg_486', -3, 41893, 41893, 1]]
Orientations are always set to 1 by default.
>>> ordering['linkage_group_3']
[['sctg_516', -3, 36614, 50582, 1]]
"""
new_records = dict()
for lg_name, linkage_group in itertools.groupby(
linkage_records, operator.itemgetter(0)
):
new_records[lg_name] = []
for record in linkage_group:
init_contig = record[-1]
start = record[1]
end = record[2]
new_record = [init_contig, -3, start, end, 1]
new_records[lg_name].append(new_record)
return new_records
[docs]def compare_orderings(info_frags_records, linkage_orderings):
"""Given linkage groups and info_frags records, link pseudo-chromosomes to
scaffolds based on the initial contig composition of each group. Because
info_frags records are usually richer and may contain contigs not found
in linkage groups, those extra sequences are discarded.
Example with two linkage groups and two chromosomes:
>>> linkage_orderings = {
... 'linkage_group_1': [
... ['sctg_516', -3, 36614, 50582, 1],
... ['sctg_486', -3, 41893, 41893, 1],
... ['sctg_486', -3, 50054, 62841, 1],
... ['sctg_207', -3, 31842, 94039, 1],
... ['sctg_558', -3, 51212, 54212, 1],
... ],
... 'linkage_group_2': [
... ['sctg_308', -3, 15892, 25865, 1],
... ['sctg_842', -3, 0, 8974, 1],
... ['sctg_994', -3, 0, 81213, 1],
... ],
... }
>>> info_frags = {
... 'scaffold_A': [
... ['sctg_308', 996, 15892, 25865, 1],
... ['sctg_778', 1210, 45040, 78112, -1],
... ['sctg_842', 124, 0, 8974, 1],
... ],
... 'scaffold_B': [
... ['sctg_516', 5, 0, 38000, 1],
... ['sctg_486', 47, 42050, 49000, 1],
... ['sctg_1755', 878, 95001, 10844, -1],
... ['sctg_842', 126, 19000, 26084, 1],
... ['sctg_207', 705, 45500, 87056, 1],
... ],
... 'scaffold_C': [
... ['sctg_558', 745, 50045, 67851, 1],
... ['sctg_994', 12, 74201, 86010, -1],
... ],
... }
>>> matching_pairs = compare_orderings(info_frags, linkage_orderings)
>>> matching_pairs['scaffold_B']
(3, 'linkage_group_1', {'sctg_558': 'sctg_207'})
>>> matching_pairs['scaffold_A']
(2, 'linkage_group_2', {'sctg_994': 'sctg_842'})
"""
scaffolds = info_frags_records.keys()
linkage_groups = linkage_orderings.keys()
best_matching_table = dict()
for scaffold, linkage_group in itertools.product(
scaffolds, linkage_groups
):
lg_ordering = [
init_contig
for init_contig, _ in itertools.groupby(
linkage_orderings[linkage_group], operator.itemgetter(0)
)
]
scaffold_ordering = [
init_contig
for init_contig, bin_group in itertools.groupby(
info_frags_records[scaffold], operator.itemgetter(0)
)
if init_contig in lg_ordering
]
overlap = set(lg_ordering).intersection(set(scaffold_ordering))
missing_locations = dict()
for missing_block in sorted(set(lg_ordering) - set(overlap)):
for i, init_contig in enumerate(lg_ordering):
if init_contig == missing_block:
try:
block_before = lg_ordering[i - 1]
except IndexError:
block_before = "beginning"
missing_locations[missing_block] = block_before
try:
if len(overlap) > best_matching_table[scaffold][0]:
best_matching_table[scaffold] = (
len(overlap),
linkage_group,
missing_locations,
)
except KeyError:
best_matching_table[scaffold] = (
len(overlap),
linkage_group,
missing_locations,
)
return best_matching_table
[docs]def get_missing_blocks(info_frags_records, matching_pairs, linkage_orderings):
"""Get missing blocks in a scaffold based on the genetic map order.
Given matching scaffold blocks/genetic map blocks (based on restriction
sites and SNP markers, respectively), move around the scaffold blocks
such that they map the genetic map order.
Parameters
----------
info_frags_records : dict
A dictionary representing the scaffolds and their block order as
described in an info_frags.txt file
matching_pairs : dict
A list of best matching pairs in the form
(scaffold_block, linkage group)
linkage_orderings : dict
A dictionary representing the genetic map and the linkage groups
as described in csv file.
Example
-------
>>> linkage_orderings = {
... 'linkage_group_1': [
... ['sctg_516', -3, 36614, 50582, 1],
... ['sctg_486', -3, 41893, 41893, 1],
... ['sctg_486', -3, 50054, 62841, 1],
... ['sctg_207', -3, 31842, 94039, 1],
... ['sctg_558', -3, 51212, 54212, 1],
... ],
... 'linkage_group_2': [
... ['sctg_308', -3, 15892, 25865, 1],
... ['sctg_842', -3, 0, 8974, 1],
... ['sctg_994', -3, 0, 81213, 1],
... ],
... }
>>> info_frags = {
... 'scaffold_A': [
... ['sctg_308', 996, 15892, 25865, 1],
... ['sctg_778', 1210, 45040, 78112, -1],
... ['sctg_842', 124, 0, 8974, 1],
... ],
... 'scaffold_B': [
... ['sctg_516', 5, 0, 38000, 1],
... ['sctg_486', 47, 42050, 49000, 1],
... ['sctg_1755', 878, 95001, 10844, -1],
... ['sctg_842', 126, 19000, 26084, 1],
... ['sctg_207', 705, 45500, 87056, 1],
... ],
... 'scaffold_C': [
... ['sctg_558', 745, 50045, 67851, 1],
... ['sctg_994', 12, 74201, 86010, -1],
... ],
... }
>>> matching_pairs = compare_orderings(info_frags, linkage_orderings)
>>> new_records = get_missing_blocks(info_frags, matching_pairs,
... linkage_orderings)
>>> for my_bin in new_records['scaffold_A']:
... print(list(my_bin))
...
['sctg_308', 996, 15892, 25865, 1]
['sctg_778', 1210, 45040, 78112, -1]
['sctg_842', 124, 0, 8974, 1]
['sctg_842', 126, 19000, 26084, 1]
['sctg_994', 12, 74201, 86010, -1]
>>> for my_bin in new_records['scaffold_B']:
... print(list(my_bin))
...
['sctg_516', 5, 0, 38000, 1]
['sctg_486', 47, 42050, 49000, 1]
['sctg_1755', 878, 95001, 10844, -1]
['sctg_207', 705, 45500, 87056, 1]
['sctg_558', 745, 50045, 67851, 1]
"""
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
touched_lgs = set()
def record_length(key_record_tuple):
return len(key_record_tuple[1])
new_scaffolds = copy.deepcopy(info_frags_records)
for scaffold_name in collections.OrderedDict(
sorted(new_scaffolds.items(), key=record_length, reverse=True)
):
scaffold = new_scaffolds[scaffold_name]
new_scaffold = []
corresponding_lg = matching_pairs[scaffold_name][1]
if corresponding_lg in touched_lgs:
continue
else:
touched_lgs.add(corresponding_lg)
scaffold_block_names = {my_bin[0] for my_bin in scaffold}
lg_block_names = [
my_block[0] for my_block in linkage_orderings[corresponding_lg]
]
touched_bins = set()
for lg_block_name in lg_block_names:
if lg_block_name in scaffold_block_names:
for my_bin in scaffold:
if tuple(my_bin) in new_scaffold:
continue
elif (
my_bin[0] == lg_block_name
or my_bin[0] not in lg_block_names
):
new_scaffold.append(tuple(my_bin))
touched_bins.add(tuple(my_bin))
else:
break
for other_name, other_scaffold in new_scaffolds.items():
if other_name == scaffold_name:
continue
i = 0
for my_bin in other_scaffold:
if tuple(my_bin) in new_scaffold:
i += 1
continue
elif my_bin[0] == lg_block_name:
moved_bin = tuple(other_scaffold.pop(i))
new_scaffold.append(tuple(moved_bin))
touched_bins.add(tuple(moved_bin))
i -= 1
i += 1
for remaining_bin in scaffold:
if tuple(remaining_bin) not in touched_bins:
new_scaffold.append(tuple(remaining_bin))
touched_bins.add(tuple(remaining_bin))
if len(new_scaffold) > 0:
new_scaffolds[scaffold_name] = new_scaffold
return new_scaffolds
[docs]def main():
arguments = docopt.docopt(__doc__, version=VERSION_NUMBER)
linkage_file = arguments["<linkage.csv>"]
info_frags_file = arguments["<info_frags.txt>"]
output = arguments["<output_file>"]
fasta = arguments["--fasta"]
output_fasta = "{}_linkage.fa".format(output)
linkage_csv = parse_linkage_csv(linkage_file)
info_frags_records = parse_info_frags.parse_info_frags(info_frags_file)
collapsed_linkage = collapse_degenerate_markers(linkage_csv)
group_ordering = linkage_group_ordering(collapsed_linkage)
matching_table = compare_orderings(info_frags_records, group_ordering)
new_scaffolds = get_missing_blocks(
info_frags_records, matching_table, group_ordering
)
parse_info_frags.write_info_frags(new_scaffolds, output=output)
if fasta:
parse_info_frags.write_fasta(fasta, new_scaffolds, output=output_fasta)
if __name__ == "__main__":
main()