/**
 * Loads the file metadata exposed by an already configured {@link VariantReader}.
 * <p>
 * The reader is opened and its {@code pre()} step is run; the header, sample ids and stats of the
 * reader's metadata are then copied into the target object. The reader is closed in all cases.
 *
 * @param reader   Initialized variant reader; must not be {@code null}
 * @param metadata Optional metadata to fill up; when {@code null}, a new instance with empty id and path is used
 * @return The metadata object that was filled (the given one, or the newly created one)
 * @throws IOException if an I/O error occurs
 */
public static VariantFileMetadata readVariantFileMetadata(VariantReader reader, VariantFileMetadata metadata)
        throws IOException {
    Objects.requireNonNull(reader);
    final VariantFileMetadata target = metadata == null ? new VariantFileMetadata("", "") : metadata;
    try {
        reader.open();
        reader.pre();
        target.setHeader(reader.getVariantFileMetadata().getHeader());
        target.setSampleIds(reader.getVariantFileMetadata().getSampleIds());
        target.setStats(reader.getVariantFileMetadata().getStats());
        reader.post();
    } finally {
        reader.close();
    }
    return target;
}
@Override public boolean pre() { if (metadataPath != null) { Files.exists(metadataPath); try (InputStream inputStream = FileUtils.newInputStream(metadataPath)) { // Read global JSON file and copy its info into the already available VariantSource object VariantFileMetadata readMetadata = VariantReaderUtils.readVariantFileMetadataFromJson(inputStream); fileMetadata.setId(readMetadata.getId()); fileMetadata.setPath(readMetadata.getPath()); fileMetadata.setHeader(readMetadata.getHeader()); fileMetadata.setSamplesPosition(readMetadata.getSamplesPosition()); fileMetadata.setStats(readMetadata.getStats()); } catch (IOException ex) { throw new UncheckedIOException(ex); } } if (fileMetadata != null) { Map<String, Integer> samplesPosition = fileMetadata.getSamplesPosition(); this.samplesPosition = new LinkedHashMap<>(samplesPosition.size()); String[] samples = new String[samplesPosition.size()]; for (Map.Entry<String, Integer> entry : samplesPosition.entrySet()) { samples[entry.getValue()] = entry.getKey(); } for (int i = 0; i < samples.length; i++) { this.samplesPosition.put(samples[i], i); } } return true; }
/**
 * Creates a calculator of VariantSetStats for a file.
 * <p>
 * A new, empty {@link VariantSetStats} instance is created and also installed on the given file
 * metadata through {@code setStats}.
 *
 * @param studyId      StudyId
 * @param fileMetadata VariantFileMetadata of the file; its id, sample ids and header are read here
 */
public VariantSetStatsCalculator(String studyId, VariantFileMetadata fileMetadata) {
    this.studyId = studyId;
    this.metadata = fileMetadata.toVariantStudyMetadata(studyId);

    // Values taken from the file metadata
    this.files = Collections.singleton(fileMetadata.getId());
    this.samples = new HashSet<>(fileMetadata.getSampleIds());
    this.header = fileMetadata.getHeader();

    // Fresh stats object, shared with the file metadata
    this.stats = new VariantSetStats();
    fileMetadata.setStats(this.stats);
}
/**
 * Fills {@code samplesPosition} from the current sample ids if it has not been set yet.
 * <p>
 * Does nothing when {@code samplesPosition} is already non-null. When there are no sample ids
 * ({@code getSampleIds()} returns {@code null}) the field stays {@code null}.
 */
private synchronized void updateSamplesPosition() {
    if (samplesPosition != null) {
        return;
    }
    final List<String> sampleIds = getSampleIds();
    samplesPosition = sampleIds == null ? null : getSamplesPositionMap(sampleIds);
}
/**
 * Converts a file metadata object by delegating to the overload that takes the file id and the
 * sample ids.
 *
 * @param fileMetadata VariantFileMetadata providing the file id and the sample ids
 * @return the result of the delegated conversion
 */
@Override
public List<CS> convert(VariantFileMetadata fileMetadata) {
    final String fileId = fileMetadata.getId();
    final List<String> sampleIds = fileMetadata.getSampleIds();
    return convert(fileId, sampleIds);
}
private VariantFileMetadata checkTransformedVariants(URI variantsJson, StudyConfiguration studyConfiguration, int expectedNumVariants) throws StorageEngineException { long start = System.currentTimeMillis(); VariantFileMetadata source = new VariantFileMetadata("6", VCF_TEST_FILE_NAME); VariantReader variantReader = VariantReaderUtils.getVariantReader(Paths.get(variantsJson.getPath()), source.toVariantStudyMetadata(String.valueOf(studyConfiguration.getStudyId()))); variantReader.open(); variantReader.pre(); List<Variant> read; int numVariants = 0; while ((read = variantReader.read(100)) != null && !read.isEmpty()) { numVariants += read.size(); } variantReader.post(); variantReader.close(); if (expectedNumVariants < 0) { expectedNumVariants = source.getStats().getNumVariants(); } else { assertEquals(expectedNumVariants, source.getStats().getNumVariants()); //9792 } assertEquals(expectedNumVariants, numVariants); //9792 logger.info("checkTransformedVariants time : " + (System.currentTimeMillis() - start) / 1000.0 + "s"); return source; }
/**
 * Builds study metadata for a VCF file.
 * <p>
 * A file metadata object is created with the file name of the given path as id and an empty path,
 * and then converted to study metadata for {@code STUDY_NAME}.
 *
 * @param vcfPath Path of the VCF file; only its file name is used
 * @return the study metadata derived from the file metadata
 */
protected static VariantStudyMetadata getMetadata(Path vcfPath) {
    final String fileId = vcfPath.getFileName().toString();
    final VariantFileMetadata fileMetadata = new VariantFileMetadata(fileId, "");
    return fileMetadata.toVariantStudyMetadata(STUDY_NAME);
}
/**
 * Get the non-ref column name for a file given a VariantFileMetadata.
 * <p>
 * The id of the file metadata is parsed as an integer and passed to the overload taking a file id.
 *
 * @param fileMetadata VariantFileMetadata whose id is a decimal integer
 * @return Column name or Qualifier
 * @throws NumberFormatException if the file metadata id is not a parsable integer
 */
public static String getNonRefColumnName(VariantFileMetadata fileMetadata) {
    final int fileId = Integer.parseInt(fileMetadata.getId());
    return getNonRefColumnName(fileId);
}
int expectedCount = 0; for (VariantType variantType : TARGET_VARIANT_TYPE_SET) { expectedCount += variantFileMetadata.getStats().getVariantTypeCounts().getOrDefault(variantType.toString(), 0); for (VariantType type : VariantType.values()) { if (!TARGET_VARIANT_TYPE_SET.contains(type)) { Integer countByType = variantFileMetadata.getStats().getVariantTypeCounts().get(type.toString()); if (countByType != null && countByType > 0) { logger.info(" * Of which " + countByType + " are " + type.toString() + " variants.");
/**
 * Converts the current file metadata into study metadata for this helper's study.
 *
 * @return the study metadata, identified by the study id rendered as a string
 */
public VariantStudyMetadata getStudyMetadata() {
    final String studyId = String.valueOf(getStudyId());
    return meta.get().toVariantStudyMetadata(studyId);
}
public VariantTransformTask(VCFHeader header, VCFHeaderVersion version, String studyId, VariantFileMetadata fileMetadata, Path outputFileJsonFile, VariantSetStatsCalculator variantStatsTask, boolean includeSrc, boolean generateReferenceBlocks) { this.variantStatsTask = variantStatsTask; this.factory = null; this.fileMetadata = fileMetadata; this.metadata = fileMetadata.toVariantStudyMetadata(studyId); this.outputFileJsonFile = outputFileJsonFile; this.includeSrc = includeSrc; this.vcfCodec = new FullVcfCodec(); this.vcfCodec.setVCFHeader(header, version); this.converter = new VariantContextToVariantConverter(studyId, fileMetadata.getId(), fileMetadata.getSampleIds()); this.normalizer = new VariantNormalizer(true, true, false); normalizer.setGenerateReferenceBlocks(generateReferenceBlocks); }
/**
 * Read a VariantFileMetadata from an InputStream.
 * <p>
 * InputStream must point to a json object. The JSON is deserialized into the
 * {@code org.opencb.biodata.models.variant.metadata.VariantFileMetadata} model and then wrapped.
 *
 * @param inputStream Input variant file metadata stream
 * @return Read VariantFileMetadata
 * @throws IOException if there is an error reading
 */
public static VariantFileMetadata readVariantFileMetadataFromJson(InputStream inputStream) throws IOException {
    final ObjectMapper jsonMapper = new ObjectMapper();
    jsonMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);

    final org.opencb.biodata.models.variant.metadata.VariantFileMetadata rawMetadata =
            jsonMapper.readValue(inputStream, org.opencb.biodata.models.variant.metadata.VariantFileMetadata.class);

    return new VariantFileMetadata(rawMetadata);
}
/**
 * Reads up to 1000000 variants from a VCF classpath resource and registers its file and samples in
 * the given study configuration.
 * <p>
 * Each sample name of the file gets {@code sampleSufix} appended. Samples not yet known to the study
 * configuration are registered with the next free id. The sorted samples position of the first
 * study entry of every variant is replaced with the suffixed names.
 *
 * @param sc          Study configuration to update (attributes, sample ids, file ids, samples in files)
 * @param fileName    Classpath resource name of the VCF file
 * @param fileId      Id of the file
 * @param sampleSufix Suffix appended to every sample name
 * @return the variants read from the resource
 */
protected List<Variant> readVariants(StudyConfiguration sc, String fileName, Integer fileId, String sampleSufix) {
    FullVcfCodec codec = new FullVcfCodec();
    LineIterator lineIterator = codec.makeSourceFromStream(getClass().getResourceAsStream(fileName));
    VCFHeader header = (VCFHeader) codec.readActualHeader(lineIterator);

    VariantNormalizer normalizer = new VariantNormalizer().configure(header);
    VariantFileMetadata file = new VariantFileMetadata(fileId.toString(), "file");
    VariantStudyMetadata studyMetadata = file.toVariantStudyMetadata(String.valueOf(sc.getStudyId()));
    VariantVcfHtsjdkReader reader = new VariantVcfHtsjdkReader(getClass().getResourceAsStream(fileName), studyMetadata, normalizer);

    List<Variant> variants;
    // Close the reader even when opening or reading fails, so the underlying stream is not leaked.
    try {
        reader.open();
        reader.pre();
        variants = reader.read(1000000);
        reader.post();
    } finally {
        reader.close();
    }

    sc.getAttributes().append(DEFAULT_GENOTYPE.key(), defaultGenotype);

    LinkedHashSet<Integer> sampleIds = new LinkedHashSet<>();
    LinkedHashMap<String, Integer> samplesPosition = new LinkedHashMap<>();
    for (String sample : file.getSampleIds()) {
        String sampleName = sample + sampleSufix;
        // New samples get the next consecutive id; existing ones keep theirs.
        sc.getSampleIds().putIfAbsent(sampleName, sc.getSampleIds().size() + 1);
        sampleIds.add(sc.getSampleIds().get(sampleName));
        samplesPosition.put(sampleName, samplesPosition.size());
    }
    sc.getFileIds().put(getFileName(fileId), fileId);
    sc.getSamplesInFiles().put(fileId, sampleIds);

    for (Variant variant : variants) {
        variant.getStudies().get(0).setSortedSamplesPosition(samplesPosition);
    }
    return variants;
}
@Override public VariantFileMetadata convert(VariantSource legacy) { VariantFileMetadata fileMetadata = new VariantFileMetadata(legacy.getFileId(), legacy.getFileName()); fileMetadata.setSampleIds(legacy.getSamples()); fileMetadata.setStats(variantSetStats); fileMetadata.setAttributes(attributes); fileMetadata.setHeader(variantFileHeader);
VariantStudyMetadata metadata = fileMetadata.toVariantStudyMetadata(String.valueOf(studyId)); int numRecords = fileMetadata.getStats().getNumVariants(); int batchSize = options.getInt(Options.LOAD_BATCH_SIZE.key(), Options.LOAD_BATCH_SIZE.defaultValue()); int loadThreads = options.getInt(Options.LOAD_THREADS.key(), Options.LOAD_THREADS.defaultValue()); loadStats.append("writeResult", writeResult); fileMetadata.setId(String.valueOf(fileId)); dbAdaptor.getStudyConfigurationManager().updateVariantFileMetadata(String.valueOf(studyId), fileMetadata); } catch (ExecutionException e) {
normalizer.configure(fileMetadata.getHeader()); normalizer.setGenerateReferenceBlocks(generateReferenceBlocks); String studyId = String.valueOf(getStudyId()); if (VariantReaderUtils.isVcf(input.toString())) { VariantVcfHtsjdkReader reader = new VariantVcfHtsjdkReader(input, fileMetadata.toVariantStudyMetadata(studyId), normalizer); if (null != malformatedHandler) { dataReader = VariantReaderUtils.getVariantReader(input, fileMetadata.toVariantStudyMetadata(studyId)); helper.getStudyId(), Integer.valueOf(helper.getFileMetadata().getId()));
converter = new VariantContextToVariantConverter(metadata.getId(), fileMetadata.getId(), header.getSampleNamesInOrder()); fileMetadata.setHeader(new VCFHeaderToVariantFileHeaderConverter().convert(header)); fileMetadata.setSampleIds(header.getSampleNamesInOrder()); normalizer.configure(fileMetadata.getHeader());
/**
 * Converts a list of file metadata objects into variant sets.
 * <p>
 * For each file, one metadata element is created per complex header line (with its generic fields
 * split on commas) and one per simple header line. The variant set is then created from the file id,
 * the file path and those metadata elements. Duplicated variant sets are dropped while keeping the
 * first-seen order.
 *
 * @param variantFileMetadata File metadata objects to convert
 * @return a new list with the distinct variant sets, in encounter order
 */
@Override
public List<VS> apply(List<VariantFileMetadata> variantFileMetadata) {
    final Set<VS> distinctVariantSets = new LinkedHashSet<>();

    for (VariantFileMetadata fileMetadata : variantFileMetadata) {
        final List<Object> metadata = new ArrayList<>();

        // Complex header lines: key, id, type, number, description plus the comma-split generic fields
        fileMetadata.getHeader().getComplexLines().forEach(complexLine -> {
            Map<String, List<String>> info = complexLine.getGenericFields()
                    .entrySet()
                    .stream()
                    .collect(Collectors.toMap(
                            Map.Entry::getKey,
                            field -> Arrays.asList(field.getValue().split(","))));
            metadata.add(factory.newVariantSetMetadata(
                    complexLine.getKey(), null, complexLine.getId(), complexLine.getType(),
                    complexLine.getNumber(), complexLine.getDescription(), info));
        });

        // Simple header lines: only key and value
        fileMetadata.getHeader().getSimpleLines().forEach(simpleLine -> {
            metadata.add(factory.newVariantSetMetadata(
                    simpleLine.getKey(), simpleLine.getValue(), null, null, null, null, Collections.emptyMap()));
        });

        @SuppressWarnings("unchecked")
        VS variantSet = (VS) factory.newVariantSet(fileMetadata.getId(), fileMetadata.getPath(), "", "", (List) metadata);
        distinctVariantSets.add(variantSet);
    }

    return new ArrayList<>(distinctVariantSets);
}
private void stageVariants(StudyConfiguration study, int fileId, List<Variant> variants) throws Exception { String archiveTableName = engine.getArchiveTableName(study.getStudyId()); ArchiveTableHelper.createArchiveTableIfNeeded(dbAdaptor.getGenomeHelper(), archiveTableName); // Create empty VariantFileMetadata VariantFileMetadata fileMetadata = new VariantFileMetadata(String.valueOf(fileId), String.valueOf(fileId)); fileMetadata.setSampleIds(variants.get(0).getStudies().get(0).getOrderedSamplesName()); dbAdaptor.getStudyConfigurationManager().updateVariantFileMetadata(String.valueOf(study.getStudyId()), fileMetadata); // Create dummy reader VariantSliceReader reader = getVariantSliceReader(variants, study.getStudyId(), fileId); // Task supplier Supplier<ParallelTaskRunner.Task<ImmutablePair<Long, List<Variant>>, VcfSliceProtos.VcfSlice>> taskSupplier = () -> { VariantToVcfSliceConverter converter = new VariantToVcfSliceConverter(); return list -> { System.out.println("list.size() = " + list.size()); List<VcfSliceProtos.VcfSlice> vcfSlice = new ArrayList<>(list.size()); for (ImmutablePair<Long, List<Variant>> pair : list) { vcfSlice.add(converter.convert(pair.getRight(), pair.getLeft().intValue())); } return vcfSlice; }; }; // Writer VariantHBaseArchiveDataWriter writer = new VariantHBaseArchiveDataWriter(dbAdaptor.getArchiveHelper(study.getStudyId(), fileId), archiveTableName, dbAdaptor.getHBaseManager()); ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder().setNumTasks(1).build(); ParallelTaskRunner<ImmutablePair<Long, List<Variant>>, VcfSliceProtos.VcfSlice> ptr = new ParallelTaskRunner<>(reader, taskSupplier, writer, config); // Execute stage System.out.println("Stage start!"); ptr.run(); System.out.println("Stage finished!"); }
.collect(Collectors.toList()); StudyEntry entry = new StudyEntry(metadata.getId(), secondaryAlternatesMap, Arrays.asList(format.split(":"))); VariantFileMetadata fileMetadata = new VariantFileMetadata(metadata.getFiles().get(0)); entry.setFileId(fileMetadata.getId()); variant.addStudyEntry(entry);