Source code for instagraal.linkage

#!/usr/bin/env python
# coding: utf-8

"""Genetic map based validation

Aggregate genetic linkage data with an existing assembly for correction and
polishing purposes.

Usage:
    linkage.py <linkage.csv> <info_frags.txt> [--output <output_file>]
                                              [--fasta <reference.fa>]

Options:
    -h, --help              Display this help message.
    --version               Display the program's current version.
    -o, --output            Prefix of output files
    -f, --fasta             If a fasta reference is supplied, also write
                            the corresponding genome from info_frags.txt

"""

import functools
import itertools
import operator
import collections
import copy
import docopt
import numpy as np
from instagraal import parse_info_frags

parse_linkage_csv = functools.partial(
    np.genfromtxt, dtype=None, skip_header=1, encoding="utf-8"
)


[docs]def collapse_degenerate_markers(linkage_records):

    """Group all markers with no genetic distance as distinct features
    to generate a BED file with.

    Simple example with sixteen degenerate markers:

        >>> marker_features = [
        ... ['36915_sctg_207_31842', 1, 0, 207, 31842],
        ... ['36941_sctg_207_61615', 1, 0, 207, 61615],
        ... ['36956_sctg_207_77757', 1, 0, 207, 77757],
        ... ['36957_sctg_207_78332', 1, 0, 207, 78332],
        ... ['36972_sctg_207_94039', 1, 0, 207, 94039],
        ... ['36788_sctg_207_116303', 1, 0.652, 207, 116303],
        ... ['36812_sctg_207_158925', 1, 1.25, 207, 158925],
        ... ['36819_sctg_207_165424', 1, 1.25, 207, 165424],
        ... ['36828_sctg_207_190813', 1, 1.25, 207, 190813],
        ... ['36830_sctg_207_191645', 1, 1.25, 207, 191645],
        ... ['36834_sctg_207_195961', 1, 1.25, 207, 195961],
        ... ['36855_sctg_207_233632', 1, 1.25, 207, 233632],
        ... ['36881_sctg_207_258658', 1, 1.25, 207, 258658],
        ... ['82072_sctg_486_41893', 1, 3.756, 486, 41893],
        ... ['85634_sctg_516_36614', 1,	3.756, 516, 36614],
        ... ['85638_sctg_516_50582', 1, 3.756, 516, 50582]]

        >>> len(marker_features)
        16

        >>> collapsed_features = collapse_degenerate_markers(marker_features)
        >>> len(collapsed_features)
        5

    The degenerate features (identical linkage group, genetic distance and
    original scaffold) are collapsed into a region:

        >>> collapsed_features[0]
        [1, 31842, 94039, 207]

    The format is [linkage group, start, end, original scaffold].

    If a singleton (non-degenerate) feature is found, the region is simply
    a single point in the genome:

        >>> collapsed_features[1]
        [1, 116303, 116303, 207]

    so 'start' and 'end' are identical.

    Two markers are not considered degenerate if they belong to different
    original scaffolds, even if they are in terms of genetic linkage:

        >>> collapsed_features[2]
        [1, 158925, 258658, 207]
        >>> collapsed_features[3:]
        [[1, 41893, 41893, 486], [1, 36614, 50582, 516]]
    """

    def degeneracy(linkage_record):
        linkage_group, genetic_distance, scaffold = (
            linkage_record[1],
            linkage_record[2],
            linkage_record[3],
        )
        return (linkage_group, genetic_distance, scaffold)

    degenerate_records = []
    for _, degenerate_group in itertools.groupby(
        linkage_records, key=degeneracy
    ):
        group_list = list(degenerate_group)
        start_record, end_record = group_list[0], group_list[-1]

        assert (start_record[1], start_record[2], start_record[3]) == (
            end_record[1],
            end_record[2],
            end_record[3],
        )

        start_position = start_record[-1]
        end_position = end_record[-1]

        scaffold = start_record[3]
        linkage_group = start_record[1]
        record = [linkage_group, start_position, end_position, scaffold]

        degenerate_records.append(record)

    return degenerate_records


[docs]def write_bed(records, output_file):
    with open(output_file, "w") as bed_handle:
        for record in records:
            line = "\t".join((str(field) for field in record))
            bed_handle.write("{}\n".format(line))


[docs]def linkage_group_ordering(linkage_records):
    """Convert degenerate linkage records into ordered info_frags-like records
    for comparison purposes.

    Simple example:

        >>> linkage_records = [
        ... ['linkage_group_1', 31842, 94039, 'sctg_207'],
        ... ['linkage_group_1', 95303, 95303, 'sctg_207'],
        ... ['linkage_group_2', 15892, 25865, 'sctg_308'],
        ... ['linkage_group_2', 41893, 41893, 'sctg_486'],
        ... ['linkage_group_3', 36614, 50582, 'sctg_516'],
        ... ]
        >>> ordering = linkage_group_ordering(linkage_records)

    Each key of the record is a newly-formed 'scaffold' (linkage group):

        >>> sorted(ordering.keys())
        ['linkage_group_1', 'linkage_group_2', 'linkage_group_3']

    Records are in the form [init_contig, frag_id, start, end, orientation].
    Since fragment ids are meaningless in non-HiC contexts a negative
    identifier is set so it is understood that region was added due to
    linkage data (-1 is for recovering data after first-pass polishing and -2
    is for sequence insertions after long read based polishing).

        >>> ordering['linkage_group_1']
        [['sctg_207', -3, 31842, 94039, 1], ['sctg_207', -3, 95303, 95303, 1]]
        >>> ordering['linkage_group_2']
        [['sctg_308', -3, 15892, 25865, 1], ['sctg_486', -3, 41893, 41893, 1]]

    Orientations are always set to 1 by default.

        >>> ordering['linkage_group_3']
        [['sctg_516', -3, 36614, 50582, 1]]


    """

    new_records = dict()

    for lg_name, linkage_group in itertools.groupby(
        linkage_records, operator.itemgetter(0)
    ):
        new_records[lg_name] = []
        for record in linkage_group:
            init_contig = record[-1]
            start = record[1]
            end = record[2]

            new_record = [init_contig, -3, start, end, 1]

            new_records[lg_name].append(new_record)

    return new_records


[docs]def compare_orderings(info_frags_records, linkage_orderings):

    """Given linkage groups and info_frags records, link pseudo-chromosomes to
    scaffolds based on the initial contig composition of each group. Because
    info_frags records are usually richer and may contain contigs not found
    in linkage groups, those extra sequences are discarded.

    Example with two linkage groups and two chromosomes:

        >>> linkage_orderings = {
        ...     'linkage_group_1': [
        ...         ['sctg_516', -3, 36614, 50582, 1],
        ...         ['sctg_486', -3, 41893, 41893, 1],
        ...         ['sctg_486', -3, 50054, 62841, 1],
        ...         ['sctg_207', -3, 31842, 94039, 1],
        ...         ['sctg_558', -3, 51212, 54212, 1],
        ...     ],
        ...     'linkage_group_2': [
        ...         ['sctg_308', -3, 15892, 25865, 1],
        ...         ['sctg_842', -3, 0, 8974, 1],
        ...         ['sctg_994', -3, 0, 81213, 1],
        ...     ],
        ... }
        >>> info_frags = {
        ...     'scaffold_A': [
        ...         ['sctg_308', 996, 15892, 25865, 1],
        ...         ['sctg_778', 1210, 45040, 78112, -1],
        ...         ['sctg_842', 124, 0, 8974, 1],
        ...     ],
        ...     'scaffold_B': [
        ...         ['sctg_516', 5, 0, 38000, 1],
        ...         ['sctg_486', 47, 42050, 49000, 1],
        ...         ['sctg_1755', 878, 95001, 10844, -1],
        ...         ['sctg_842', 126, 19000, 26084, 1],
        ...         ['sctg_207', 705, 45500, 87056, 1],
        ...     ],
        ...     'scaffold_C': [
        ...        ['sctg_558', 745, 50045, 67851, 1],
        ...        ['sctg_994', 12, 74201, 86010, -1],
        ...     ],
        ... }
        >>> matching_pairs = compare_orderings(info_frags, linkage_orderings)
        >>> matching_pairs['scaffold_B']
        (3, 'linkage_group_1', {'sctg_558': 'sctg_207'})
        >>> matching_pairs['scaffold_A']
        (2, 'linkage_group_2', {'sctg_994': 'sctg_842'})

    """

    scaffolds = info_frags_records.keys()
    linkage_groups = linkage_orderings.keys()

    best_matching_table = dict()

    for scaffold, linkage_group in itertools.product(
        scaffolds, linkage_groups
    ):
        lg_ordering = [
            init_contig
            for init_contig, _ in itertools.groupby(
                linkage_orderings[linkage_group], operator.itemgetter(0)
            )
        ]
        scaffold_ordering = [
            init_contig
            for init_contig, bin_group in itertools.groupby(
                info_frags_records[scaffold], operator.itemgetter(0)
            )
            if init_contig in lg_ordering
        ]

        overlap = set(lg_ordering).intersection(set(scaffold_ordering))
        missing_locations = dict()
        for missing_block in sorted(set(lg_ordering) - set(overlap)):
            for i, init_contig in enumerate(lg_ordering):
                if init_contig == missing_block:
                    try:
                        block_before = lg_ordering[i - 1]
                    except IndexError:
                        block_before = "beginning"
                    missing_locations[missing_block] = block_before

        try:
            if len(overlap) > best_matching_table[scaffold][0]:
                best_matching_table[scaffold] = (
                    len(overlap),
                    linkage_group,
                    missing_locations,
                )

        except KeyError:
            best_matching_table[scaffold] = (
                len(overlap),
                linkage_group,
                missing_locations,
            )

    return best_matching_table


[docs]def get_missing_blocks(info_frags_records, matching_pairs, linkage_orderings):
    """Get missing blocks in a scaffold based on the genetic map order.

    Given matching scaffold blocks/genetic map blocks (based on restriction 
    sites and SNP markers, respectively), move around the scaffold blocks
    such that they map the genetic map order.

    Parameters
    ----------
    info_frags_records : dict
        A dictionary representing the scaffolds and their block order as
        described in an info_frags.txt file
    matching_pairs : dict
        A list of best matching pairs in the form 
        (scaffold_block, linkage group)
    linkage_orderings : dict
        A dictionary representing the genetic map and the linkage groups
        as described in csv file.        

    Example
    -------

        >>> linkage_orderings = {
        ...     'linkage_group_1': [
        ...         ['sctg_516', -3, 36614, 50582, 1],
        ...         ['sctg_486', -3, 41893, 41893, 1],
        ...         ['sctg_486', -3, 50054, 62841, 1],
        ...         ['sctg_207', -3, 31842, 94039, 1],
        ...         ['sctg_558', -3, 51212, 54212, 1],
        ...     ],
        ...     'linkage_group_2': [
        ...         ['sctg_308', -3, 15892, 25865, 1],
        ...         ['sctg_842', -3, 0, 8974, 1],
        ...         ['sctg_994', -3, 0, 81213, 1],
        ...     ],
        ... }
        >>> info_frags = {
        ...     'scaffold_A': [
        ...         ['sctg_308', 996, 15892, 25865, 1],
        ...         ['sctg_778', 1210, 45040, 78112, -1],
        ...         ['sctg_842', 124, 0, 8974, 1],
        ...     ],
        ...     'scaffold_B': [
        ...         ['sctg_516', 5, 0, 38000, 1],
        ...         ['sctg_486', 47, 42050, 49000, 1],
        ...         ['sctg_1755', 878, 95001, 10844, -1],
        ...         ['sctg_842', 126, 19000, 26084, 1],
        ...         ['sctg_207', 705, 45500, 87056, 1],
        ...     ],
        ...     'scaffold_C': [
        ...        ['sctg_558', 745, 50045, 67851, 1],
        ...        ['sctg_994', 12, 74201, 86010, -1],
        ...     ],
        ... }
        >>> matching_pairs = compare_orderings(info_frags, linkage_orderings)
        >>> new_records = get_missing_blocks(info_frags, matching_pairs,
        ...                                  linkage_orderings)
        >>> for my_bin in new_records['scaffold_A']:
        ...     print(list(my_bin))
        ...
        ['sctg_308', 996, 15892, 25865, 1]
        ['sctg_778', 1210, 45040, 78112, -1]
        ['sctg_842', 124, 0, 8974, 1]
        ['sctg_842', 126, 19000, 26084, 1]
        ['sctg_994', 12, 74201, 86010, -1]

        >>> for my_bin in new_records['scaffold_B']:
        ...     print(list(my_bin))
        ...
        ['sctg_516', 5, 0, 38000, 1]
        ['sctg_486', 47, 42050, 49000, 1]
        ['sctg_1755', 878, 95001, 10844, -1]
        ['sctg_207', 705, 45500, 87056, 1]
        ['sctg_558', 745, 50045, 67851, 1]

    """

    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    touched_lgs = set()

    def record_length(key_record_tuple):
        return len(key_record_tuple[1])

    new_scaffolds = copy.deepcopy(info_frags_records)

    for scaffold_name in collections.OrderedDict(
        sorted(new_scaffolds.items(), key=record_length, reverse=True)
    ):
        scaffold = new_scaffolds[scaffold_name]
        new_scaffold = []
        corresponding_lg = matching_pairs[scaffold_name][1]
        if corresponding_lg in touched_lgs:
            continue
        else:
            touched_lgs.add(corresponding_lg)
        scaffold_block_names = {my_bin[0] for my_bin in scaffold}
        lg_block_names = [
            my_block[0] for my_block in linkage_orderings[corresponding_lg]
        ]

        touched_bins = set()

        for lg_block_name in lg_block_names:
            if lg_block_name in scaffold_block_names:
                for my_bin in scaffold:
                    if tuple(my_bin) in new_scaffold:
                        continue
                    elif (
                        my_bin[0] == lg_block_name
                        or my_bin[0] not in lg_block_names
                    ):
                        new_scaffold.append(tuple(my_bin))
                        touched_bins.add(tuple(my_bin))
                    else:
                        break
            for other_name, other_scaffold in new_scaffolds.items():
                if other_name == scaffold_name:
                    continue
                i = 0
                for my_bin in other_scaffold:
                    if tuple(my_bin) in new_scaffold:
                        i += 1
                        continue
                    elif my_bin[0] == lg_block_name:
                        moved_bin = tuple(other_scaffold.pop(i))
                        new_scaffold.append(tuple(moved_bin))
                        touched_bins.add(tuple(moved_bin))
                        i -= 1
                    i += 1

        for remaining_bin in scaffold:
            if tuple(remaining_bin) not in touched_bins:
                new_scaffold.append(tuple(remaining_bin))
                touched_bins.add(tuple(remaining_bin))

        if len(new_scaffold) > 0:
            new_scaffolds[scaffold_name] = new_scaffold

    return new_scaffolds


[docs]def main():
    arguments = docopt.docopt(__doc__, version=VERSION_NUMBER)

    linkage_file = arguments["<linkage.csv>"]
    info_frags_file = arguments["<info_frags.txt>"]
    output = arguments["<output_file>"]
    fasta = arguments["--fasta"]

    output_fasta = "{}_linkage.fa".format(output)

    linkage_csv = parse_linkage_csv(linkage_file)
    info_frags_records = parse_info_frags.parse_info_frags(info_frags_file)
    collapsed_linkage = collapse_degenerate_markers(linkage_csv)
    group_ordering = linkage_group_ordering(collapsed_linkage)
    matching_table = compare_orderings(info_frags_records, group_ordering)
    new_scaffolds = get_missing_blocks(
        info_frags_records, matching_table, group_ordering
    )

    parse_info_frags.write_info_frags(new_scaffolds, output=output)

    if fasta:
        parse_info_frags.write_fasta(fasta, new_scaffolds, output=output_fasta)


if __name__ == "__main__":
    main()