/** * Fix the AD for the GenotypesContext of a VariantContext that has been subset * * @param originalGs the original GenotypesContext * @param originalVC the original VariantContext * @param allelesToUse the new (sub)set of alleles to use * @return a new non-null GenotypesContext */ public static GenotypesContext fixADFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List<Allele> allelesToUse) { if (originalGs == null) throw new IllegalArgumentException("the original Gs cannot be null"); if (originalVC == null) throw new IllegalArgumentException("the original VC cannot be null"); if (allelesToUse == null) throw new IllegalArgumentException("the alleles to use list cannot be null"); // the bitset representing the allele indexes we want to keep final BitSet alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); // the new genotypes to create final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); // the samples final List<String> sampleIndices = originalGs.getSampleNamesOrderedByName(); // create the new genotypes for ( int k = 0; k < originalGs.size(); k++ ) { final Genotype g = originalGs.get(sampleIndices.get(k)); newGTs.add(fixAD(g, alleleIndexesToUse)); } return newGTs; }
@Override public boolean isEmpty() { // optimization -- we know the number of samples in the unparsed data, so use it here to // avoid parsing just to know if the genotypes context is empty return loaded ? super.isEmpty() : nUnparsedGenotypes == 0; }
/**
 * Iterates over the Genotypes of this context ordered by sample name (A, B, C), whatever the
 * order of the underlying vector of genotypes is.
 *
 * Delegates to the overload that takes the sample names, passing the names sorted by name.
 *
 * @return an Iterable over the genotypes in this context
 */
public Iterable<Genotype> iterateInSampleNameOrder() {
    return iterateInSampleNameOrder(
            getSampleNamesOrderedByName()
    );
}
/**
 * Creates a freshly allocated GenotypesContext holding the genotypes of {@code toCopy}.
 *
 * The genotypes are first copied into a new list, so the result does not share its backing
 * list with {@code toCopy}.
 *
 * @param toCopy the GenotypesContext to copy
 * @return a mutable GenotypesContext containing the genotypes
 */
public static final GenotypesContext copy(final GenotypesContext toCopy) {
    final ArrayList<Genotype> duplicate = new ArrayList<Genotype>(toCopy.getGenotypes());
    return create(duplicate);
}
/**
 * Test helper that checks a GenotypesContext against a list of expected genotypes.
 *
 * The visible assertions compare emptiness and size of {@code gc} with {@code expectedSamples},
 * compare elements by index, check that the MISSING sample is not contained, and exercise the
 * iteration, sample-name, sorted-order and containsSamples accessors of the context.
 *
 * NOTE(review): this excerpt looks abridged -- the braces are unbalanced and the identifiers
 * i, name, genotypeNames and withMissing are used without a visible declaration (presumably
 * loops and local declarations were dropped). Confirm against the full source before editing.
 *
 * @param gc              the context under test
 * @param expectedSamples the genotypes the context is expected to hold
 */
private final void testGenotypesContextContainsExpectedSamples(GenotypesContext gc, List<Genotype> expectedSamples) { Assert.assertEquals(gc.isEmpty(), expectedSamples.isEmpty()); Assert.assertEquals(gc.size(), expectedSamples.size()); Assert.assertEquals(gc.get(i), expectedSamples.get(i)); Assert.assertFalse(gc.containsSample(MISSING.getSampleName())); Assert.assertTrue(gc.containsSample(name)); Assert.assertFalse(gc.containsSample(MISSING.getSampleName())); testIterable(gc.iterateInSampleNameOrder(), genotypeNames); testIterable(gc, genotypeNames); testIterable(gc.iterateInSampleNameOrder(genotypeNames), genotypeNames); if ( ! genotypeNames.isEmpty() ) { Set<String> first = Collections.singleton(genotypeNames.iterator().next()); testIterable(gc.iterateInSampleNameOrder(first), first); assertEqualsSet(gc.getSampleNames(), genotypeNames, "gc sample names vs. expected sample names"); Assert.assertTrue(ParsingUtils.isSorted(gc.getSampleNamesOrderedByName())); Assert.assertTrue(ParsingUtils.isSorted(gc.iterateInSampleNameOrder())); Assert.assertTrue(gc.containsSamples(genotypeNames)); Assert.assertFalse(gc.containsSamples(withMissing));
/**
 * Rebuilds a VariantContext, passing each genotype's builder only those extended attributes
 * whose key is in the allowed set.
 *
 * @param vc                the variant context whose genotypes are rebuilt
 * @param allowedAttributes keys of the extended attributes to carry over; when null the input
 *                          {@code vc} is returned untouched
 * @return {@code vc} itself when {@code allowedAttributes} is null, otherwise a new
 *         VariantContext built from {@code vc} with the rebuilt genotypes
 */
public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set<String> allowedAttributes) {
    if ( allowedAttributes == null ) {
        return vc;
    }

    final GenotypesContext rebuilt = GenotypesContext.create(vc.getNSamples());
    for ( final Genotype source : vc.getGenotypes() ) {
        // collect only the extended attributes whose key is allowed
        final Map<String, Object> kept = new HashMap<>();
        for ( final Map.Entry<String, Object> entry : source.getExtendedAttributes().entrySet() ) {
            final String key = entry.getKey();
            if ( !allowedAttributes.contains(key) ) {
                continue;
            }
            kept.put(key, entry.getValue());
        }

        final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(source);
        rebuilt.add(genotypeBuilder.attributes(kept).make());
    }

    final VariantContextBuilder contextBuilder = new VariantContextBuilder(vc);
    return contextBuilder.genotypes(rebuilt).make();
}
@Override public List<VariantContext> read(int batchSize) { List<VariantContext> variantContexts = new ArrayList<>(batchSize); while (lineIterator.hasNext() && variantContexts.size() < batchSize) { String line = lineIterator.next(); if (line.startsWith("#") || line.trim().isEmpty()) { continue; } try { VariantContext variantContext = codec.decode(line); // Lazy processing management if (!lazy && variantContext.getGenotypes().isLazyWithData()) { variantContext.getGenotype(variantContext.getGenotypes().size() - 1); } variantContext.getGenotypes(); variantContexts.add(variantContext); } catch (TribbleException e) { if (e.getMessage().startsWith("The provided VCF file is malformed at approximately line number")) { logMalformatedLine(line, e); } else { throw e; } } } return variantContexts; }
private static Genotype getGenotypeFromArray(RefMetaDataTracker tracker, RodBinding<VariantContext> genotypes, boolean verifiedSampleName, boolean verifySample, String sampleName) { // get the truthForSample and the hapmap information for this site; if either are null we can't move forward Collection<VariantContext> truths = tracker.getValues(genotypes); if (truths == null || truths.isEmpty()) return null; VariantContext truthForSample = truths.iterator().next(); // verify that the sample name exists in the input genotype file if (!verifiedSampleName && verifySample) { if (!truthForSample.getSampleNames().contains(sampleName)) throw new UserException.BadInput("The sample name was set to " + sampleName + " but this sample isn't in your genotypes file. Please Verify your sample name"); verifiedSampleName = true; } GenotypesContext gt = truthForSample.getGenotypes(); // if we are supposed to verify the sample name, AND the sample doesn't exist in the genotypes -- skip this site if (verifySample && !gt.containsSample(sampleName)) return null; // if the sample doesn't exist in genotypes AND there is more than one sample in the genotypes file -- skip this site if (!gt.containsSample(sampleName) && gt.size() != 1) return null; // if there is more than one sample in the genotypes file, get it by name. Otherwise just get the sole sample genotype return gt.size() != 1 ? gt.get(sampleName) : gt.get(0); }
/**
 * Builds the sample-name to offset map if it has not been built yet.
 *
 * Each genotype's sample name is mapped to its index in {@code getGenotypes()}. If a name
 * occurs more than once, the later index overwrites the earlier one.
 */
protected void ensureSampleNameMap() {
    if ( sampleNameToOffset != null ) {
        return; // already built
    }

    sampleNameToOffset = new HashMap<String, Integer>(size());
    int offset = 0;
    while ( offset < size() ) {
        final String sampleName = getGenotypes().get(offset).getSampleName();
        sampleNameToOffset.put(sampleName, offset);
        offset++;
    }
}
@Override public int size() { // optimization -- we know the number of samples in the unparsed data, so use it here to // avoid parsing just to know the size of the context return loaded ? super.size() : nUnparsedGenotypes; }
/**
 * For every position of the provided context, replaces the genotype at that position with a
 * new (Aref, Aref) genotype for the same sample and checks that the context then matches the
 * initial samples with that one entry swapped.
 */
@Test(enabled = true, dataProvider = "GenotypesContextProvider")
public void testReplace(GenotypesContextProvider cfg) {
    final int sampleCount = cfg.makeContext().size();
    for ( int index = 0; index < sampleCount; index++ ) {
        // fresh context per iteration so replacements do not accumulate
        final GenotypesContext context = cfg.makeContext();
        final Genotype original = context.get(index);
        final Genotype substitute = GenotypeBuilder.create(original.getSampleName(), Arrays.asList(Aref, Aref));
        context.replace(substitute);

        // expected contents: the initial samples with the same slot swapped
        final ArrayList<Genotype> expected = new ArrayList<Genotype>(cfg.initialSamples);
        expected.set(index, substitute);

        Assert.assertEquals(substitute, context.get(index));
        testGenotypesContextContainsExpectedSamples(context, expected);
    }
}
/** * Subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * * @param vc variant context with genotype likelihoods * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** * @param assignGenotypes assignment strategy for the (subsetted) PLs * @return a new non-null GenotypesContext with subsetted alleles */ public static GenotypesContext subsetAlleles(final VariantContext vc, final List<Allele> allelesToUse, final GenotypeAssignmentMethod assignGenotypes) { if ( vc == null ) throw new IllegalArgumentException("the VariantContext cannot be null"); if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); if ( allelesToUse.isEmpty() ) throw new IllegalArgumentException("must have alleles to use"); if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); // optimization: if no input genotypes, just exit if (vc.getGenotypes().isEmpty()) return GenotypesContext.create(); // find the likelihoods indexes to use from the used alternate alleles final List<List<Integer>> likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); // find the strand allele count indexes to use from the used alternate alleles final List<Integer> sacIndexesToUse = determineSACIndexesToUse(vc, allelesToUse); // create the new genotypes return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, sacIndexesToUse, assignGenotypes); }
/**
 * Creates a GenotypesContext holding the genotypes of {@code toCopy} in its iteration order.
 *
 * @param toCopy the collection of genotypes; may be null
 * @return {@code NO_GENOTYPES} when {@code toCopy} is null, otherwise a GenotypesContext
 *         created from a new list copy of the collection
 */
public static final GenotypesContext copy(final Collection<Genotype> toCopy) {
    if ( toCopy == null ) {
        return NO_GENOTYPES;
    }
    final ArrayList<Genotype> snapshot = new ArrayList<Genotype>(toCopy);
    return create(snapshot);
}
/**
 * Returns the genotype at the given position of this object's genotypes.
 *
 * @param ith the index handed to the genotypes container's {@code get}
 * @return the genotype found at that index
 */
public Genotype getGenotype(int ith) {
    final Genotype atIndex = genotypes.get(ith);
    return atIndex;
}
/** * Unpack GenotypesContext into arraylist of double values * @param GLs Input genotype context * @param keepUninformative Don't filter out uninformative genotype likelihoods (i.e. all log likelihoods near 0) * This is useful for VariantContexts with a NON_REF allele * @return ArrayList of doubles corresponding to GL vectors */ protected static ArrayList<double[]> getGLs(final GenotypesContext GLs, final boolean includeDummy, final boolean keepUninformative) { final ArrayList<double[]> genotypeLikelihoods = new ArrayList<>(GLs.size() + 1); if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { if ( sample.hasLikelihoods() ) { final double[] gls = sample.getLikelihoods().getAsVector(); if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL || keepUninformative ) genotypeLikelihoods.add(gls); } } return genotypeLikelihoods; }
/**
 * Fully decodes every genotype of this object against the header and stores the results in
 * the builder without validation.
 *
 * @param builder the builder that receives the decoded genotypes
 * @param header  the header passed to the per-genotype decode
 */
private final void fullyDecodeGenotypes(final VariantContextBuilder builder, final VCFHeader header) {
    final GenotypesContext decoded = new GenotypesContext();
    for ( final Genotype encoded : getGenotypes() ) {
        final Genotype decodedGenotype = fullyDecodeGenotypes(encoded, header);
        decoded.add(decodedGenotype);
    }
    builder.genotypesNoValidation(decoded);
}
/**
 * Asserts that two GenotypesContexts agree on their sorted sample names and, for every sample
 * of {@code expected}, on alleles, AD and PL.
 *
 * A null {@code expected} requires a null {@code actual}. A non-null {@code expected} requires
 * a non-null {@code actual}; this is asserted explicitly so the mismatch is reported as an
 * assertion failure rather than a NullPointerException.
 *
 * @param actual   the context produced by the code under test
 * @param expected the reference context; may be null
 */
public static void assertEquals(final GenotypesContext actual, final GenotypesContext expected) {
    if (expected == null) {
        Assert.assertNull(actual);
        return;
    }
    Assert.assertNotNull(actual, "Actual GenotypesContext is null but a non-null one was expected");
    Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "Sample names differ");
    for (final String name : expected.getSampleNamesOrderedByName()) {
        Assert.assertEquals(actual.get(name).getAlleles(), expected.get(name).getAlleles(), "Alleles differ for sample " + name);
        Assert.assertEquals(actual.get(name).getAD(), expected.get(name).getAD(), "AD differs for sample " + name);
        Assert.assertEquals(actual.get(name).getPL(), expected.get(name).getPL(), "PL differs for sample " + name);
    }
}