Source code for dinf.feature_extractor

from __future__ import annotations
import collections
from typing import Callable, Dict, Mapping, Tuple

import numpy as np
import numpy.typing as npt
import tskit

from .vcf import BagOfVcf
from .misc import ts_ploidy_of_individuals, ts_nodes_of_individuals


class _FeatureMatrix:
    """
    Common functionality of HaplotypeMatrix and BinnedHaplotypeMatrix
    """

    def __init__(
        self,
        *,
        num_individuals: int,
        num_loci: int,
        ploidy: int,
        phased: bool,
        maf_thresh: float | None = None,
    ):
        """
        :param num_individuals:
            The number of individuals to include in the feature matrix.
        :param num_loci:
            Dimensionality along the sequence length. This might be the number
            of snps, or the number of bins into which the sequence is partitioned.
        :param ploidy:
            Ploidy of the individuals.
        :param phased:
            If True, the individuals' haplotypes will each be included as
            independent rows in the feature matrix and the shape of the
            feature matrix will be ``(ploidy * num_individuals, num_loci, c)``.
            If False, the allele counts for each individual will be summed
            across their chromosome copies and the shape of the feature matrix
            will be ``(num_individuals, num_loci, c)``.
        :param maf_thresh:
            Minor allele frequency (MAF) threshold. Sites with MAF lower than
            this value are ignored. If None, only invariant sites will be excluded.
        """
        if num_individuals < 1:
            raise ValueError("must have num_individuals >= 1")
        if num_loci < 1:
            raise ValueError("must have num_loci >= 1")
        self._num_individuals = num_individuals
        self._num_loci = num_loci
        self._phased = phased
        self._ploidy = ploidy
        self._num_haplotypes = num_individuals * ploidy

        if maf_thresh is None:
            # Exclude invariant sites.
            self._allele_count_threshold = 1.0
        else:
            if maf_thresh < 0 or maf_thresh > 1:
                raise ValueError("must have 0 <= maf_thresh <= 1")
            self._allele_count_threshold = maf_thresh * self._num_haplotypes

        # The number of rows in the feature matrix is determined by
        # whether or not the data should be treated as phased.
        # If the data are not phased, we sum genotypes across the
        # chromosome copies. We call the rows "pseudo haplotypes",
        # regardless of the phasing.
        if phased:
            self._num_pseudo_haplotypes = self._num_haplotypes
        else:
            self._num_pseudo_haplotypes = num_individuals

        if self._num_pseudo_haplotypes < 2:
            raise ValueError("must have at least two pseudo-haplotypes")

    @property
    def shape(self):
        """Shape of the feature matrix."""
        raise NotImplementedError

    def _from_genotype_matrix(
        self,
        G: np.ndarray,
        *,
        positions: np.ndarray,
        sequence_length: int,
    ) -> np.ndarray:
        """
        Create a feature matrix from a regular genotype matrix.

        Missing genotypes in the input are assumed to take the value -1,
        and the first allele has genotype 0, second allele genotype 1, etc.

        :param G:
            Genotype matrix with shape (num_sites, num_individuals, ploidy).
            The genotypes may be phased or unphased.
        :param positions:
            Vector of variant positions.
        :param sequence_length:
            The length of the sequence from which the matrix is derived.
        :return:
            Array with shape ``(num_pseudo_haplotypes, num_loci, c)``.
        """
        raise NotImplementedError

    def from_ts(self, ts: tskit.TreeSequence) -> np.ndarray:
        """
        Create a feature matrix from a :ref:`tskit <tskit:sec_introduction>`
        tree sequence.

        :param ts: The tree sequence.
        :return:
            Array with shape ``(num_pseudo_haplotypes, num_loci, c)``.
        """
        if ts.num_samples != self._num_haplotypes:
            raise ValueError(
                f"Expected {self._num_haplotypes} haplotypes, "
                f"but ts.num_samples == {ts.num_samples}."
            )
        if ts.sequence_length < self._num_loci:
            raise ValueError("Sequence length is shorter than the number of loci")

        G = ts.genotype_matrix()  # shape is (num_sites, num_haplotypes)
        G = np.reshape(G, (G.shape[0], self._num_individuals, self._ploidy))
        positions = np.array(ts.tables.sites.position)
        return self._from_genotype_matrix(
            G, positions=positions, sequence_length=ts.sequence_length
        )

    def from_vcf(
        self,
        vb: BagOfVcf,
        *,
        sequence_length: int,
        max_missing_genotypes: int,
        min_seg_sites: int,
        rng: np.random.Generator,
    ) -> np.ndarray:
        """
        Create a feature matrix from a region of a VCF/BCF.

        The genomic window for a feature matrix is drawn uniformly at random
        from the contigs defined in the given :class:`BagOfVcf`, ``vb``.

        Individuals in the VCFs are sampled (without replacement) for
        inclusion in the output matrix. The size of the feature space
        can therefore be vastly increased by having more individuals
        in the VCFs than are needed for the feature dimensions.

        :param vb:
            The BagOfVcf object that describes the VCF/BCF files.
        :param sequence_length:
            Length of the genomic window to be sampled.
        :param max_missing_genotypes:
            Consider only sites with at most this many missing genotype calls.
        :param min_seg_sites:
            Sampled genotype matrix must have at least this many variable
            sites (after filtering sites for missingness).
        :param numpy.random.Generator rng:
            Numpy random number generator.
        :return:
            Array with shape ``(num_pseudo_haplotypes, num_loci, c)``.
        """
        G, positions = vb.sample_genotype_matrix(
            sequence_length=sequence_length,
            max_missing_genotypes=max_missing_genotypes,
            min_seg_sites=min_seg_sites,
            require_phased=self._phased,
            rng=rng,
        )

        G_individuals = G.shape[1]
        if G_individuals < self._num_individuals:
            raise ValueError(
                f"Expected at least {self._num_individuals} individuals in "
                f"the vcf bag, but only found {G_individuals}."
            )

        # Subsample individuals.
        idx = rng.choice(G_individuals, size=self._num_individuals, replace=False)
        G = G[:, idx, :]
        if np.any(G == -2):
            raise ValueError("Mismatched ploidy among individuals.")

        return self._from_genotype_matrix(
            G, positions=positions, sequence_length=sequence_length
        )


[docs]class HaplotypeMatrix(_FeatureMatrix): """ A factory for feature matrices consisting of haplotypes and relative positions. The feature is an :math:`n \\times m \\times c` array, where the channel dimension :math:`c` is 2. The first channel is a haplotype matrix and the second channel is a matrix of relative SNP positions. The haplotype matrix is an :math:`n \\times m` matrix, where :math:`n` corresponds to the number of haplotypes (or number of individuals, for unphased data) and :math:`m` corresponds to the number of SNPs along the sequence. For phased data, each entry is a 0 or 1 corresponding to the major or minor allele respectively. For unphased data, each entry is the count of minor alleles across all chromosomes in the individual. Only polymorphic SNPs are considered, and for multiallelic sites, only the first two alleles are used. Alleles are polarised by choosing the most frequent allele to be encoded as 0 (the major allele), and the second most frequent allele as 1 (the minor allele). The position matrix is an :math:`n \\times m` matrix, where the vector of :math:`m` inter-SNP distances are repeated :math:`n` times---once for each haplotype (or each individual, for unphased data). Each entry is the distance from the previous SNP (as a proportion of the sequence length). The first inter-SNP distance in the vector is always zero. | Chan et al. 2018, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7687905/ | Wang et al. 2021, https://doi.org/10.1111/1755-0998.13386 """ def __init__( self, *, num_individuals: int, num_loci: int, ploidy: int, phased: bool, maf_thresh: float | None = None, ): """ :param num_individuals: The number of individuals to include in the feature matrix. :param num_loci: The number of SNP sites to extract. The central ``num_loci`` SNPs in the sequence will be used. If there are fewer than ``num_loci`` SNPs, the feature matrix will be padded on both sides with zeros. :param ploidy: Ploidy of the individuals. :param maf_thresh: Minor allele frequency (MAF) threshold. Sites with MAF lower than this value are ignored. If None, only invariant sites will be excluded. :param phased: If True, the individuals' haplotypes will each be included as independent rows in the feature matrix and the shape of the feature matrix will be ``(ploidy * num_individuals, num_loci, c)``. If False, the allele counts for each individual will be summed across their chromosome copies and the shape of the feature matrix will be ``(num_individuals, num_loci, c)``. """ super().__init__( num_individuals=num_individuals, num_loci=num_loci, ploidy=ploidy, maf_thresh=maf_thresh, phased=phased, ) self._dtype = np.float32 @property def shape(self) -> Tuple[int, ...]: """Shape of the feature matrix.""" return (self._num_pseudo_haplotypes, self._num_loci, 2) def _from_genotype_matrix( self, G: np.ndarray, *, positions: np.ndarray, sequence_length: int, ) -> np.ndarray: """ Create feature matrix from a regular genotype matrix. Missing genotypes in the input are assumed to take the value -1, and the first allele has genotype 0, second allele genotype 1, etc. We consider only allele 0 and 1. For multiallelic sites this means all but the first two alleles are ignored. :param G: Genotype matrix with shape (num_sites, num_individuals, ploidy). The genotypes must be phased or unphased. :param positions: Vector of variant positions. :param sequence_length: Unused. :return: Array with shape ``(num_haplotypes, num_loci, 2)``. For a matrix :math:`M`, the :math:`M[i][j][0]`'th entry is the genotype of haplotype :math:`i` at the :math:`j`'th site. The :math:`M[i][j][1]`'th entry is the number of basepairs between sites :math:`j` and :math:`j-1`. """ assert len(G) == len(positions) G_sites, G_individuals, G_ploidy = G.shape assert G_individuals == self._num_individuals assert G_ploidy == self._ploidy G = np.reshape(G, (G_sites, G_individuals * G_ploidy)) # Filter sites with insufficient minor allele count. ac0 = np.sum(G == 0, axis=1) ac1 = np.sum(G == 1, axis=1) keep = np.minimum(ac0, ac1) >= self._allele_count_threshold G = G[keep] positions = positions[keep] G, positions = self._get_fixed_num_snps( G, positions=positions, num_snps=self._num_loci ) # Identify genotypes that aren't 0 or 1. These could be missing # (encoded as -1), or high-numbered alleles at multiallelic sites. missing = np.logical_or(G == -1, G >= 2) ac0 = np.sum(G == 0, axis=1) ac1 = np.sum(G == 1, axis=1) # Polarise 0 and 1 in genotype matrix by major allele frequency. flip = ac1 > ac0 G ^= np.expand_dims(flip, -1) # Treat missing genotypes as the majority allele. G[missing] = 0 if not self._phased and self._ploidy > 1: # Collapse each individual's chromosome copies by summing counts. G = np.reshape(G, (G.shape[0], self._num_individuals, self._ploidy)) G = np.sum(G, axis=2) positions = np.around(positions) delta_positions = np.diff(positions, prepend=positions[0]) delta_positions = delta_positions.astype(self._dtype) / sequence_length M = np.zeros(self.shape, dtype=self._dtype) M[..., 0] = G.T M[..., 1] = np.tile(delta_positions, [self._num_pseudo_haplotypes, 1]) return M def _get_fixed_num_snps(self, G: np.ndarray, *, positions: np.ndarray, num_snps): """ Trim or pad the genotype matrix and positions to be of fixed size. """ assert len(G) == len(positions) G_sites, G_haplotypes = G.shape delta = G_sites - num_snps if delta >= 0: # trim left = delta // 2 right = left + num_snps G = G[left:right] positions = positions[left:right] else: # pad pad_left = -delta // 2 pad_right = num_snps - G_sites - pad_left G_left = np.zeros((pad_left, G_haplotypes), dtype=G.dtype) G_right = np.zeros((pad_right, G_haplotypes), dtype=G.dtype) G = np.concatenate((G_left, G, G_right)) positions_left = np.zeros(pad_left, dtype=positions.dtype) right_pad_value = 0 if len(positions) == 0 else positions[-1] positions_right = np.full(pad_right, right_pad_value, dtype=positions.dtype) positions = np.concatenate((positions_left, positions, positions_right)) return G, positions
[docs]class BinnedHaplotypeMatrix(_FeatureMatrix): """ A factory for feature matrices of pseudo-haplotypes. The binned haplotype matrix is an :math:`n \\times m` matrix, where :math:`n` corresponds to the number of haplotypes (or polyploid genotypes, for unphased data) and :math:`m` corresponds to a set of equally sized bins along the sequence length. Each matrix entry contains the count of minor alleles in an individual's haplotype (or polyploid genotype) in a given bin. Only polymorphic SNPs are considered, and for multiallelic sites, only the first two alleles are used. Alleles are polarised by choosing the most frequent allele to be encoded as 0, and the second most frequent allele as 1. As the features are intended to be passed to a covolutional neural network, the output dimensions are actually :math:`n \\times m \\times 1`, where the final dimension is the (unused) "channels" dimension for the convolution. Gower et al. 2021, https://doi.org/10.7554/eLife.64669 """ def __init__( self, *, num_individuals: int, num_loci: int, ploidy: int, phased: bool, maf_thresh: float | None = None, ): """ :param num_individuals: The number of individuals to include in the feature matrix. :param num_loci: The number of bins into which the sequence is partitioned. Each bin spans ``sequence_length / num_loci`` base pairs. :param ploidy: Ploidy of the individuals. :param phased: If True, the individuals' haplotypes will each be included as independent rows in the feature matrix and the shape of the feature matrix will be ``(ploidy * num_individuals, num_loci, 1)``. If False, the allele counts for each individual will be summed across their chromosome copies and the shape of the feature matrix will be ``(num_individuals, num_loci, 1)``. :param maf_thresh: Minor allele frequency (MAF) threshold. Sites with MAF lower than this value are ignored. If None, only invariant sites will be excluded. """ super().__init__( num_individuals=num_individuals, num_loci=num_loci, ploidy=ploidy, maf_thresh=maf_thresh, phased=phased, ) # The numpy data type of the feature matrix. To save memory we use # an np.int8, which assumes that counts are small. # However, counts may overflow int8 for very large values of # mu * Ne * num_individuals * sequence_length / num_loci, # Probably num_loci could be increased in such cases. # TODO: print a warning if overflow occurs during conversion. self._dtype = np.int8 @property def shape(self) -> Tuple[int, ...]: """Shape of the feature matrix.""" return (self._num_pseudo_haplotypes, self._num_loci, 1) def _from_genotype_matrix( self, G: np.ndarray, *, positions: np.ndarray, sequence_length: int, ) -> np.ndarray: """ Create a pseudo-genotype matrix from a regular genotype matrix. Missing genotypes in the input are assumed to take the value -1, and the first allele has genotype 0, second allele genotype 1, etc. We consider only allele 0 and 1. For multiallelic sites this means all but the first two alleles are ignored. :param G: Genotype matrix with shape (num_sites, num_individuals, ploidy). The genotypes may be phased or unphased. :param positions: Vector of variant positions. :param sequence_length: The length of the sequence from which the matrix is derived. :return: Array with shape ``(num_pseudo_haplotypes, num_loci, 1)``. For a matrix :math:`M`, the :math:`M[i][j][0]`'th entry is the count of minor alleles in the :math:`j`'th bin of psdeudo-haplotype :math:`i`. """ assert len(G) == len(positions) M = np.zeros(self.shape, dtype=self._dtype) if len(positions) == 0: return M bins = np.floor_divide(positions * self._num_loci, sequence_length).astype( np.int32 ) G_sites, G_individuals, G_ploidy = G.shape assert G_individuals == self._num_individuals assert G_ploidy == self._ploidy G = np.reshape(G, (G_sites, G_individuals * G_ploidy)) # Identify genotypes that aren't 0 or 1. These will be ignored later. missing = np.logical_or(G == -1, G >= 2) ac0 = np.sum(G == 0, axis=1) ac1 = np.sum(G == 1, axis=1) keep = np.minimum(ac0, ac1) >= self._allele_count_threshold # Polarise 0 and 1 in genotype matrix by major allele frequency. flip = ac1 > ac0 G ^= np.expand_dims(flip, -1) # Exclude missing genotypes from the sums below. G[missing] = 0 if not self._phased: # Collapse each individual's chromosome copies by summing counts. G = np.reshape(G, (G_sites, self._num_individuals, self._ploidy)) G = np.sum(G, axis=2) for j, genotypes in zip(bins[keep], G[keep]): M[:, j, 0] += genotypes return M
class _MultipleFeatureMatrices: """ A factory for labelled collections of :class:`_FeatureMatrix` objects. One feature matrix is produced for each label. Labels correspond to collections of individuals that will be treated as exchangeable (e.g. populations). """ _feature_matrix_cls: Callable def __init__( self, *, num_individuals: Mapping[str, int], num_loci: Mapping[str, int], ploidy: Mapping[str, int], # TODO: label-specific option for maf_thresh # maf_thresh: Mapping[str, float] | None = None, global_maf_thresh: float | None = None, # TODO: label-specific option for phased. # phased: Mapping[str, bool] | None = None, global_phased: bool, ): """ :param num_individuals: A dict that maps labels to the number of individuals in the feature matrix. :param num_loci: A dict that maps labels to the number of feature loci to be extracted from the sequence. :param ploidy: A dict that maps labels to the ploidy of the individuals. :param global_maf_thresh: Minor allele frequency (MAF) threshold. Sites with MAF lower than this value are ignored. MAF is calculated across all individuals. If None, only invariant sites will be excluded. :param global_phased: If True, the individuals' haplotypes will each be included as independent rows in each feature matrix and the shape of the feature matrix for label ``l`` will be ``(ploidy[l] * num_individuals[l], num_loci[l], c)``. If False, the allele counts for each individual will be summed across their chromosome copies and the shape of the feature matrix will be ``(num_individuals[l], num_loci[l], c)``. """ dict_args = [num_individuals, num_loci, ploidy] dict_strs = "num_individuals, num_loci, ploidy" for d in dict_args: if not isinstance(d, collections.abc.Mapping): raise TypeError(f"Expected dict for each of: {dict_strs}.") keys_list = [d.keys() for d in dict_args] labels = keys_list[0] if any(labels != other for other in keys_list[1:]): raise ValueError(f"Must use the same dict keys for each of: {dict_strs}.") self.features = { label: self._feature_matrix_cls( num_individuals=num_individuals[label], num_loci=num_loci[label], ploidy=ploidy[label], maf_thresh=0, phased=global_phased, ) for label in labels } self._num_individuals = num_individuals self._num_loci = num_loci self._ploidy = ploidy self._global_phased = global_phased # We use a minimum threshold of 1 to exclude invariant sites. if global_maf_thresh is None: global_maf_thresh = 0 total_haplotypes = sum( num_individuals[label] * ploidy[label] for label in labels ) self._global_allele_count_threshold = max( 1, global_maf_thresh * total_haplotypes ) @property def shape(self) -> Dict[str, Tuple[int, ...]]: """Shape of the feature matrices.""" return {label: bhm.shape for label, bhm in self.features.items()} def from_ts( self, ts: tskit.TreeSequence, *, individuals: Mapping[str, npt.NDArray[np.integer]], ) -> Dict[str, np.ndarray]: """ Create pseudo-genotype matrices from a tree sequence. :param ts: The tree sequence. :param numpy.random.Generator rng: Numpy random number generator. :param individuals: A mapping from label to an array of individuals. :return: A dictionary mapping a label ``l`` to a feature array. Each array has shape ``(num_pseudo_haplotypes[l], num_loci[l], c)``. For an array :math:`M`, the :math:`M[i][j][0]`'th entry is the count of minor alleles in the :math:`j`'th bin of psdeudo-haplotype :math:`i`. """ if individuals.keys() != self.features.keys(): raise ValueError( f"Labels of individuals {list(individuals)} don't match " f"feature labels {list(self.features)}." ) G = ts.genotype_matrix() # shape is (num_sites, num_haplotypes) positions = np.array(ts.tables.sites.position) labelled_nodes = {} ac0 = np.zeros(len(positions)) ac1 = np.zeros(len(positions)) for label, l_individuals in individuals.items(): if ts.sequence_length < self._num_loci[label]: raise ValueError( f"{label}: sequence length ({ts.sequence_length}) is " f"shorter than the number of loci ({self._num_loci[label]})." ) if len(l_individuals) != self._num_individuals[label]: raise ValueError( f"{label}: expected {self._num_individuals[label]} " f"individuals, but got {len(l_individuals)}." ) ploidy = ts_ploidy_of_individuals(ts, l_individuals) if not np.all(ploidy == self._ploidy[label]): raise ValueError( f"{label}: not all individuals have ploidy == {self._ploidy[label]}" f"\n{label} ploidies: {ploidy}." ) nodes = ts_nodes_of_individuals(ts, l_individuals) labelled_nodes[label] = nodes H = G[:, labelled_nodes[label]] ac0 += np.sum(H == 0, axis=1) ac1 += np.sum(H == 1, axis=1) # Filter sites with insufficient minor allele count. keep = np.minimum(ac0, ac1) >= self._global_allele_count_threshold G = G[keep] positions = positions[keep] labelled_features = {} for label, l_individuals in individuals.items(): H = G[:, labelled_nodes[label]] H = np.reshape( H, (H.shape[0], self._num_individuals[label], self._ploidy[label]) ) labelled_features[label] = self.features[label]._from_genotype_matrix( H, positions=positions, sequence_length=ts.sequence_length ) return labelled_features def from_vcf( self, vb: BagOfVcf, *, sequence_length: int, max_missing_genotypes: int, min_seg_sites: int, rng: np.random.Generator, ) -> Dict[str, np.ndarray]: """ Create pseudo-genotype matrices from a region of a VCF/BCF. The genomic window is drawn uniformly at random from the sequences defined in the given :class:`BagOfVcf`. Individuals in the VCFs are sampled (without replacement) for inclusion in the output matrix. The size of the feature space can therefore be vastly increased by having more individuals in the VCFs than are needed for the feature dimensions. :param vb: A collection of indexed VCF/BCF files. :param sequence_length: Length of the genomic window to be sampled. :param max_missing_genotypes: Consider only sites with fewer missing genotype calls than this number. :param min_seg_sites: Sampled genotype matrix must have at least this many variable sites (after filtering sites for missingness). :param numpy.random.Generator rng: Numpy random number generator. :return: A dictionary mapping a label ``l`` to a feature array. Each array has shape ``(num_pseudo_haplotypes[l], num_loci[l], c)``. For an array :math:`M`, the :math:`M[i][j][0]`'th entry is the count of minor alleles in the :math:`j`'th bin of psdeudo-haplotype :math:`i`. """ if vb.samples is None or self.features.keys() != vb.samples.keys(): sample_labels = None if vb.samples is None else list(vb.samples) raise ValueError( f"Feature labels {list(self.features)} don't match the " f"vcf bag's sample labels: {sample_labels}." ) G, positions = vb.sample_genotype_matrix( sequence_length=sequence_length, max_missing_genotypes=max_missing_genotypes, min_seg_sites=min_seg_sites, require_phased=self._global_phased, rng=rng, ) num_samples = [len(v) for v in vb.samples.values()] offsets = np.cumsum([0] + num_samples[:-1]) labelled_indexes = {} ac0 = np.zeros(len(positions)) ac1 = np.zeros(len(positions)) for j, label in enumerate(vb.samples.keys()): if num_samples[j] < self._num_individuals[label]: raise ValueError( f"{label}: Expected at least {self._num_individuals[label]} " f"individuals in the vcf bag, but only found {num_samples[j]}." ) # Subsample individuals. idx = offsets[j] + rng.choice( num_samples[j], size=self._num_individuals[label], replace=False ) labelled_indexes[label] = idx H = G[:, idx, : self._ploidy[label]] ac0 += np.sum(H == 0, axis=(1, 2)) ac1 += np.sum(H == 1, axis=(1, 2)) # Filter sites with insufficient minor allele count. keep = np.minimum(ac0, ac1) >= self._global_allele_count_threshold G = G[keep] positions = positions[keep] labelled_features = {} for label, feature_matrix in self.features.items(): idx = labelled_indexes[label] H = G[:, idx, : self._ploidy[label]] ploidy_pad = G[:, idx, self._ploidy[label] :] if np.any(H == -2) or np.any(ploidy_pad != -2): raise ValueError(f"{label}: mismatched ploidy among individuals.") labelled_features[label] = feature_matrix._from_genotype_matrix( H, positions=positions, sequence_length=sequence_length ) return labelled_features
[docs]class MultipleHaplotypeMatrices(_MultipleFeatureMatrices): """ A factory for labelled collections of :class:`HaplotypeMatrix` objects. One feature matrix is produced for each label. Labels correspond to collections of individuals that will be treated as exchangeable (e.g. populations). """ _feature_matrix_cls = HaplotypeMatrix
[docs]class MultipleBinnedHaplotypeMatrices(_MultipleFeatureMatrices): """ A factory for labelled collections of :class:`BinnedHaplotypeMatrix` objects. One feature matrix is produced for each label. Labels correspond to collections of individuals that will be treated as exchangeable (e.g. populations). """ _feature_matrix_cls = BinnedHaplotypeMatrix