summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustin Lecher <jlec@gentoo.org>2015-04-13 08:26:38 +0000
committerJustin Lecher <jlec@gentoo.org>2015-04-13 08:26:38 +0000
commitc3da8ebbcbbbd53d947bcc2ee69003f5ce83173b (patch)
treef3da45b341653809ffe870be6a641930afac1a46 /sci-biology
parentStable for alpha, wrt bug #543928 (diff)
downloadhistorical-c3da8ebbcbbbd53d947bcc2ee69003f5ce83173b.tar.gz
historical-c3da8ebbcbbbd53d947bcc2ee69003f5ce83173b.tar.bz2
historical-c3da8ebbcbbbd53d947bcc2ee69003f5ce83173b.zip
Regenerate protocols, bug #514556
Package-Manager: portage-2.2.18/cvs/Linux x86_64 Manifest-Sign-Key: 0xB9D4F231BD1558AB!
Diffstat (limited to 'sci-biology')
-rw-r--r--sci-biology/goby-cpp/ChangeLog10
-rw-r--r--sci-biology/goby-cpp/Manifest29
-rw-r--r--sci-biology/goby-cpp/files/Alignments.proto597
-rw-r--r--sci-biology/goby-cpp/files/Reads.proto96
-rw-r--r--sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch16
-rw-r--r--sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild16
6 files changed, 751 insertions, 13 deletions
diff --git a/sci-biology/goby-cpp/ChangeLog b/sci-biology/goby-cpp/ChangeLog
index 051b6e226235..d31c9f8ed810 100644
--- a/sci-biology/goby-cpp/ChangeLog
+++ b/sci-biology/goby-cpp/ChangeLog
@@ -1,6 +1,11 @@
# ChangeLog for sci-biology/goby-cpp
-# Copyright 1999-2013 Gentoo Foundation; Distributed under the GPL v2
-# $Header: /var/cvsroot/gentoo-x86/sci-biology/goby-cpp/ChangeLog,v 1.4 2013/03/11 15:22:58 jlec Exp $
+# Copyright 1999-2015 Gentoo Foundation; Distributed under the GPL v2
+# $Header: /var/cvsroot/gentoo-x86/sci-biology/goby-cpp/ChangeLog,v 1.5 2015/04/13 08:26:36 jlec Exp $
+
+ 13 Apr 2015; Justin Lecher <jlec@gentoo.org> +files/Alignments.proto,
+ +files/Reads.proto, +files/goby-cpp-2.0.1-underlinking.patch,
+ goby-cpp-2.0.1.ebuild:
+ Regenerate protocols, bug #514556
11 Mar 2013; Justin Lecher <jlec@gentoo.org> metadata.xml:
Drop Andrey as maintainer so that bugs get assigned to sci-biology directly
@@ -20,4 +25,3 @@
25 Sep 2011; Andrey Kislyuk <weaver@gentoo.org> +goby-cpp-1.9.7.3.ebuild,
+metadata.xml:
New package, ebuild written by me
-
diff --git a/sci-biology/goby-cpp/Manifest b/sci-biology/goby-cpp/Manifest
index 8bd54d280103..4a341a5163ec 100644
--- a/sci-biology/goby-cpp/Manifest
+++ b/sci-biology/goby-cpp/Manifest
@@ -1,18 +1,33 @@
-----BEGIN PGP SIGNED MESSAGE-----
-Hash: SHA256
+Hash: SHA512
+AUX Alignments.proto 26192 SHA256 0f70eb8db279b0dae72a2a59113d68f3151199aed655354532e135bb9325c191 SHA512 c5b67577bfd8432e64489dfb08333adcece8712a398497facb284b52c5b773eff79d71c6dc5bcb72ab21b977423530e27cac3fa2a888c3a00cca1a09faf1b00f WHIRLPOOL 937bc8d18dba2dfb747db3ed38e1d3b5c09cc4e2eedd1ea3edfe5d9128b0a096166b71db217d9ba097c2041ec948e09056e97321e22a2d5b9d4b2d24b3cfc83b
+AUX Reads.proto 3128 SHA256 ca2cd55eadf9ad9e64111ebd461a78456223c781fad99c2705c37cbbeeee215d SHA512 eed972f7f054096ba0e4b215e1551b8acdee6024cec6091abe2b92e268b09fb77d2e9c2a90520611da3bfe37e69a8341e406c4eccb57b0c88f3664f7f2818e27 WHIRLPOOL c6092a49b5932fdc0ef70642d71f43df6d0664ef1223486032012bcffe8c4054105beb5b4d9dfd21448b6c7220f3c29ee81a147fa056d3531a292a6e646d54ad
+AUX goby-cpp-2.0.1-underlinking.patch 708 SHA256 3e6efd71da70481a078f31d751326635d52dc6414885590cf460ec0bd12b97c8 SHA512 511b91d704c981bf54d62b181c7244a1506b792897773ded7b836cbda32dfe3c4f9ab96260d6700fe665adbc7dd84a67dcbfbbb1f817c7f27aab46f8ac6bf8f0 WHIRLPOOL 9d02bb5089eb1a9e9053c31e6dd4befafd465a372a4e5f2a63c137d7967dc530225ae4cb8e33966c40bf62a28efda5ae71d1d60d4f5a75c5107b17bb1e08ae70
DIST goby_1.9.7.3-cpp.zip 127215 SHA256 8493daa7c850732c6c48d4512bd26b7eec411a729b39d9861a4a6aae08faa674
DIST goby_1.9.8.1-cpp.zip 134904 SHA256 2f1bd87f2870af178f34a8e7c11819aa9e42f35e20f1985d2ceb054f452e2a97 SHA512 d31cd7f0be19074bfe8da74d9f2510f0e0f15fe6c485bbed8520052468d2cd2f1bc5fcad8b0d6a1586f5acde73db326059f45994ecfbb5fb6c09692d8e155190 WHIRLPOOL 6ce51c46f8802d31068f510f6da13b2920086eafdae24506830b42d79e48eb6ed9cac48a96090a81964daebf4a0c8f21c490ca3b0af2f589ac57647bde1be79f
DIST goby_2.0.1-cpp.zip 177718 SHA256 5ec57b833cb1a0f53e975112d1c360b14a9b17cfff3fb0ad77dd70672c1881db SHA512 992bd10d5538dec1478820f26151dd311f4de13e7947b49f0b06d6cbdd4b71deeb3aa8a4c6a598fb92fbcb9cbf4ff97bf81205c9389d4a0da4443317e48aea9f WHIRLPOOL ab94cf674703917b6f0cde812d0fbcd94e18fb6055b30d6a1eefa1e4cb5b76bbe18c67388c66e25e87e522df9a9946b0eae5a164428abe874a382f5bc39a13d0
EBUILD goby-cpp-1.9.7.3.ebuild 674 SHA256 7ab267a5443186f13b50d88f9abf982cc2a5a8c78244b0117da98731d9caaca9 SHA512 337011aea5c4da69391757f86ba6d7f3ddf643548b2ed21b38339caea3cb3f6e742756e0c50380cdafcfb169e07305585d87ca95283eff5f945ccaf2f01dacc1 WHIRLPOOL 075bb71cc72d7b1d47f8cf1d9a55d10127fe4d7079370d7dbd94132c06e3c1f8aebcdcb7a025ac4fa2b731f9982ffc32063c81b62d63e42a43516f6efeb7c5e2
EBUILD goby-cpp-1.9.8.1.ebuild 674 SHA256 2fc8efd9e444a0b0a4e4492a31a226f921076b526b2b3a7b29529306d714de2c SHA512 37df8ab6ce8e1713d493f363b0dfc0ef92f676421b36c46ea94f44afd133172bfa61f88079ad25604ecf81c6655ec29428e502144d8449abdcdf968d76cf60ae WHIRLPOOL 07e7b0407a5338fd8e4ac24590bc13d01c99f635eeb55dbf36af94eba936c8d3127c4f22dedc32b1cf3a5b7ed6f12f8544096a09214a868e29cf7c23e488bdb4
-EBUILD goby-cpp-2.0.1.ebuild 833 SHA256 ac944887a446b99e99310abe2e8720008ef93cdbb81f86091bab79ed24891b81 SHA512 8b6a4f062a9b3acdb42a2689cedc5aa9e836c2fd7fc60852021640f91492d5e17240338ad8d8dad070d2075f403c959d1fb9adb773d0af1d263b5f5d95287b43 WHIRLPOOL 447391a6df329c79d015a050b24206b2cff5a6b9264d76cd9ffbac6ab7c242563abb4eb1afb6154336fa8d2c095fecc2d4d1d5cea146d516d1949df50733a5e6
-MISC ChangeLog 777 SHA256 e0a7f577789b8a329bf452cfdda18a2e81e1ab87443bd0fa4d9ead0633f20d98 SHA512 1157f29dc749efc5c42d94b8f9674605c45510c6a3505a81b6f8ab173c8db049063791892cf8c1238eaf3794d31c27e6bfc7358a1067507df14cedc3338ac298 WHIRLPOOL a35d2526b89be231294d082b842268385d24521d32a1d23b492662db63d82443d343ff7d7ba161767ee643bb75718c2923f94983f7d1f0abcf7444cff9dbaf9b
+EBUILD goby-cpp-2.0.1.ebuild 1017 SHA256 da8d19cbfec256bacba9058202cfad8972dc49b803f7e67355277399e26717e8 SHA512 ac27ac6268c908f47407280b81209cf8e6ee5de45c47a53811d547b13e4e6957853490d704bdaac3987a8a166672423288488a0f3f93e70798c4530a905d5802 WHIRLPOOL b34124ad3364d3eebf9dbe99d2349c6611b09daab091a8be1b3bfb7b215b31f62accdfaf319bc81f450cb5476801e9d2a5387a5fed780d1f4d1681775ab039ec
+MISC ChangeLog 974 SHA256 28d6be8ad909e9c94e2a3430daa60e4a6993173fcabcad0b8f8193cb31331ae6 SHA512 a86ba7b4ed3bbebd0a04e5740662d6fe4a46af09eaa331377e606a708cd48dd2155c849d7db5ec78f3e65ab4114c886be85f7188d3bea814007a898ae3058cab WHIRLPOOL 48c453c126c3f430727a05fcfb2c7606ab10031fb978d3b43a240cb18d0dda66344f882219d658fc288684e0328e80cb607e7503a5cfbf4e5aa6916b0606d05b
MISC metadata.xml 166 SHA256 29b228f683c71345323d841414e410c929a320f34536eb30910498728260c8ac SHA512 51c5345bb1c4466b73e2feac8895c64fc119365e7f2c156702f4c93664d3aba028b3da9daaabf24f61a88220345fca7806771a252e8ae906cac5dec97862c7e0 WHIRLPOOL 211bf955d94fc1d93b12388a2c597a8d440fb5d78f84d59b2549569537098c3525b1fbab707441d62fabe20edcac2fd9ebe09c5d9870f1c9558d7ee90d5db5b3
-----BEGIN PGP SIGNATURE-----
-Version: GnuPG v2.0.19 (GNU/Linux)
+Version: GnuPG v2.0
-iEYEAREIAAYFAlE99tMACgkQgAnW8HDreRbKVgCgvjtwiK2Ww3hoGbG7TJzIU0FF
-CWQAn2ivuU2YAmW/jIjH7YiGJ76QF8pb
-=xkvi
+iQJ8BAEBCgBmBQJVK328XxSAAAAAAC4AKGlzc3Vlci1mcHJAbm90YXRpb25zLm9w
+ZW5wZ3AuZmlmdGhob3JzZW1hbi5uZXQyQ0JDQjFGMzBDQ0UxMjFGNENDNDgxMDdC
+OUQ0RjIzMUJEMTU1OEFCAAoJELnU8jG9FVirDYgP/j1sAcQgDxRUdA7c1eZvTR6s
+BLhdZ7QQbmuL4QtqODgbpoPJH7MJ9DXGKILrpR7/VGVMdQGJnUqMTVmJmUJevUOG
+/bCHRQgpMXdnJ8RPsdwcnn4AI1X8H8HkEBuonNSzQUuBqF8YblIwUK4llo6z7OLt
+kJplmBzFAuGOQDGQYyXg30nM+woUKVU9/+7TkRu45ndMd/X95fd2Kvw/oJmwJzxx
+ttSrpTy67OO+4Ipn4ZaFTZdVibQHwjizlqiycR3nBWtHqoGqArSwxQIHisrDVIfv
+t8ZvE7PFzy6Rl58NGPl3yxNN4af7hEi13Nai/gPqvlDwWccQ+Ec7ciiyxc3POdzp
+cQymOBd41xoAhCD3I552zrTtZG3yGRtdO0IWpu3wbJsYWVy+qivT701Rr7ZH9IWq
+cnIWrU6gjQ5urKz8769hgJyVfB54a98ahvjhTRLVSLcy2y+BEDpAmXEBhdS2mcfB
+b7ldWLgv2areHqzL4/Aj4WAgJEbHuTG5V/Vs/v/WTqm6mB4yMT507Ns53lqAkQE/
+h+UDkyWDhpvJFtCPr3gYpaCyJ6MLdfiwwG7I8Q3lEbTJH+qevn85zeOeltjaM4hm
+3xO35lIS3iijm52LXJmnlBvdYNsNDUYmcNFZphKNsdXf593bx2ZmBgzLEeIusIX1
++rzY5S/+C0bpM2igMVfs
+=Rm4c
-----END PGP SIGNATURE-----
diff --git a/sci-biology/goby-cpp/files/Alignments.proto b/sci-biology/goby-cpp/files/Alignments.proto
new file mode 100644
index 000000000000..fe7f56647644
--- /dev/null
+++ b/sci-biology/goby-cpp/files/Alignments.proto
@@ -0,0 +1,597 @@
+package goby;
+
+option java_package = "edu.cornell.med.icb.goby.alignments";
+
+option optimize_for = SPEED;
+
+/*
+ This message is written to 'basename'.entries as a very large chunked collection.
+*/
+message AlignmentCollection {
+ repeated AlignmentEntry alignment_entries = 1;
+}
+
+
+message AlignmentEntry {
+ /* Multiplicity of this entry. The number of times this alignment entry would be repeated exactly the same if
+ query redundancy had not been removed by read factorization.
+ */
+ optional uint32 multiplicity = 7;
+
+ /*
+ Compressed stream of data. Removed since Goby 2.0 supports chunk codecs. Do not reuse field index 23
+ optional bytes compressed_data = 23;
+ */
+
+ /* An integer that uniquely identifies the query (a short read) in a set of alignment runs. When several
+ alignment runs are made with the same set of query sequences, equality of query index means that the query
+ sequences were the same. (Comparing integers for equality is much faster than comparing strings.)
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 query_index = 1;
+ /* An integer that uniquely identifies the target (e.g., a chromosome) in a set of alignment runs. When several
+ alignment runs are made with the same set of target sequences, equality of target index means that the target
+ sequence was the same across the runs. (Comparing integers for equality is much faster than comparing strings.)
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 target_index = 2;
+ /*
+ The position on the target of the start of the alignment between the query and the target.
+ In the following example, position is 3 because the third base of the query 'C' was aligned with
+ position 3 of the reference (two read bases were soft clipped: "ct"). This example shows that the
+ alignment can start at a mismatch if it was so constructed by the aligner.
+
+ 0123456789
+ AAAAGTCAAA target
+ ctCGTC query
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 position = 3;
+
+ /*
+ True when the query matches the target on the reverse strand
+ */
+ optional bool matching_reverse_strand = 6;
+
+ /*
+ The position on the query where the alignment starts. This value is different from zero
+ when some bases/residues of the query could not be aligned with the target.
+ TODO: Rename this to left_trim. Add a right_trim property.
+ */
+ optional uint32 query_position = 5;
+
+ /*
+ The score of the alignment, where larger scores indicate better matches between the query and the target.
+ If an aligner outputs only the number of mismatches between query and target, the score is taken to be
+ -(#mismatches(query,target)).
+ */
+ optional float score = 4;
+
+ /*
+ Number of bases/residues that differ in the alignment between query and target sequences.
+ */
+ optional uint32 number_of_mismatches = 8;
+
+ /*
+ Cumulative number of insertions and/or deletions present in the alignment.
+ */
+ optional uint32 number_of_indels = 9;
+
+ /*
+ Number of bases that have been aligned for the query. Please note that query_aligned_length must be
+ less than or equal to query_length.
+ */
+ optional uint32 query_aligned_length = 11;
+
+ /*
+ Number of bases that have been aligned for the target.
+ */
+ optional uint32 target_aligned_length = 12;
+
+ repeated SequenceVariation sequence_variations = 13;
+
+ /*
+ Length of the query sequence.
+ */
+ optional uint32 query_length = 10;
+ /*
+ Mapping Quality (phred-scaled posterior probability that the mapping
+ position of this read is incorrect). Please note that different aligners
+ may estimate mapping quality with different approaches, resulting in aligner
+ specific differences in the distribution of mapping quality. It is recommended
+ to condition mapping quality on the aligner that produced the specific alignment
+ being processed. See aligner name and version in the header.
+ Note that the following description is preliminary. A clear specification is
+ needed:
+ The mapping quality should be proportional to the
+ log of the probability that the given mapping is the "correct" one.
+ So if there are five equally good mappings of a read to the genome,
+ the probability of each would be 0.2, and the mapping quality would be
+ something like -10*log10(1-0.2) = 1. If a mapping is highly likely,
+ say a 1e-4 of it being wrong, then the mapping quality would be
+ -10*log10(1e-4) = 40.
+ */
+ optional int32 mapping_quality = 14;
+
+ /*
+ If this read was aligned with a pair, the flags for the pair alignment (based on SAM):
+ 000000001 paired
+ 000000010 properly paired
+ 000000100 read unmapped
+ 000001000 mate unmapped
+ 000010000 read reverse strand
+ 000100000 mate reverse strand
+ 001000000 first in pair
+ 010000000 second in pair
+ 100000000 not primary alignment
+ */
+ optional uint32 pair_flags = 15;
+
+ /*
+ If there is an alignment entry for the paired read (the paired read was mapped), a link to the entry is given.
+ */
+ optional RelatedAlignmentEntry pair_alignment_link = 16;
+
+ /* Index of the read fragment from which this alignment was obtained. */
+ optional uint32 fragment_index = 17;
+
+ /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two or more
+ alignment entries, one for each matching part of the read, and link these entries with
+ spliced_alignment_links. The field spliced_forward_alignment_link points to the next
+ AlignmentEntry in the chain of spliced alignments.
+ */
+ optional RelatedAlignmentEntry spliced_forward_alignment_link = 18;
+
+ /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two or more
+ alignment entries, one for each matching part of the read, and link these entries with
+ spliced_alignment_links. The field spliced_backward_alignment_link points to the previous
+ AlignmentEntry in the chain of spliced alignments.
+ */
+ optional RelatedAlignmentEntry spliced_backward_alignment_link = 22;
+
+ /*
+ If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two alignment entries, one for each
+ matching part of the read, and flag describes the spliced_alignment_link with these
+ binary flags:
+ 000000001 normal
+ 000000010 novel
+ */
+ optional uint32 spliced_flags = 19;
+
+ /* The size of the insert used when making the sequence library. This is the total size of the DNA
+ fragment to sequence, without the adapters. This is not the length of sequence that separates the reads.
+ See http://seqanswers.com/forums/showthread.php?t=8730 for details. Insert size is inferred for each pair
+ of reads by the aligner and is recorded here if it was estimated (i.e., for paired-end reads).
+ */
+ optional sint32 insert_size = 20;
+
+ /*
+ The sample index. Uniquely identifies the aligned sample this read was read from. Storing the sample index in the
+ alignment entry makes it possible to concat alignments from different origins and track what sample originally
+ contained each entry.
+ */
+ optional uint32 sample_index = 21;
+ /*
+ The total number of times the query index associated with this entry occurs across the entire alignment file.
+
+ This field is used to purge queryIndex->smallIndex associations after all instances of a queryindex have
+ been seen (see QueryIndexPermutation class). When each entry has a value for this field, the header field
+ query_index_occurrences is true.
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 query_index_occurrences = 25;
+ /*
+ The total number of times the read matches the reference across the entire alignment file. This differs from
+ query_index_occurrences because reads that are matching through splice and pair links count as one for ambiguity.
+ The field can be used to filter by ambiguity-threshold on the fly after an alignment has been done (to restrict
+ entries to smaller thresholds). When each entry has a value for this field, the header field
+ ambiguity_stored_in_entries is true.
+
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 ambiguity = 27;
+ /*
+ List of BAM attributes, if the alignment was imported from BAM. The attributes are stored in exactly the format
+ allowed for BAM. For instance, X0:i:9 X1:i:1 MD:Z:68 RG:Z:SRR084825 will be stored as four strings:
+ "X0:i:9", "X1:i:1", "MD:Z:68", "RG:Z:SRR084825". Note that sam-to-compact will interpret some BAM attributes
+ and populate goby native fields. Such tags do not appear in bam_attributes, and are instead re-generated from
+ the corresponding goby native fields.
+ Since Goby 2.0.
+ */
+ repeated string bam_attributes = 50;
+ /*
+ Quality scores for all bases of the read.
+ Since Goby 2.0.
+ */
+ optional bytes read_quality_scores = 55;
+
+ /*
+ Origin index. An integer that references a ReadOriginInfo message in the alignment header and
+ makes it possible to track the origin of the read (especially useful after several alignments
+ have been merged/concatenated).
+ (Since Goby 2.0).
+ */
+ optional uint32 read_origin_index = 26;
+ /*
+ Bases that an aligner considered do not belong to the alignment of the read to the reference. Potentially
+ erroneous bases, or bases that belong to a different part of the reference genome. Left clipped bases are
+ stored in this field as character bases, or as an equal sign character '=' when the clipped base did match
+ the reference base. For instance "A=G" for three soft-clipped bases, the middle one matching the genome at
+ this position. The number of bases in softClippedBasesLeft is exactly equal to queryPosition.
+ */
+ optional string softClippedBasesLeft = 30;
+ /*
+ Bases that an aligner considered do not belong to the alignment of the read to the reference. Potentially
+ erroneous bases, or bases that belong to a different part of the reference genome. Right clipped bases are
+ stored in this field as character bases, or as an equal sign character '=' when the clipped base did match
+ the reference base. The number of bases in softClippedBasesRight is exactly equal
+ to queryLength - queryAlignedLength - queryPosition.
+ */
+ optional string softClippedBasesRight = 31;
+
+ /*
+ Quality scores for bases in softClippedBasesLeft. Stored in Phred Units.
+ */
+ optional bytes softClippedQualityLeft = 32;
+ /*
+ Quality scores for bases in softClippedBasesRight. Stored in Phred Units.
+ */
+ optional bytes softClippedQualityRight = 33;
+ /*
+ Sequence for a read placed near this entry, but unmapped to the reference sequence. For instance, used to record
+ the sequence of a mate that did not map to the reference. We know that the mate maps in the proximity of this entry
+ (it is placed) but are unable to map it to a specific genomic position. The sequence is always given as obtained
+ from the reads file.
+ */
+ optional string placedUnmappedSequence=40;
+ /*
+ Quality scores for a read placed near this entry. Phred units.
+ */
+ optional bytes placedUnmappedQuality=41;
+
+ /*
+ Read name. In SAM/BAM this is referred to as QNAME. Paired and segmented reads will have the same Read name.
+ */
+ optional string readName=42;
+}
+
+/* A link to another alignment entry. This message type is used to represent relations
+ between alignments, such as the relation between the two read fragments in a paired-end protocol,
+ or the relation between parts of reads that align through an exon exon junction and map in
+ different locations of the genome.
+ */
+message RelatedAlignmentEntry {
+ /* Target index of the location where the other alignment entry is mapped.
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 target_index = 1;
+
+ /* Position on the reference where the other alignment entry is mapped.
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 position = 2;
+
+ /* Index of the fragment for the related alignment entry. This index
+ makes it possible to identify which of the read fragments mapped to the given
+ location is related to the source alignment entry.
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 fragment_index = 3;
+
+ optional uint32 optimized_index=50;
+}
+
+/*
+ Represents sequence variations between the query and the reference sequences. Many variations can be represented.
+ For instance, an insertion at position 5 in the reference would be represented as from="A", to="" position=5.
+ A mutation T->G at position 6 would be rendered as from="T", to="G" position=6. Padded alignments (see SAM description)
+ can be described by a combination of pair-wise alignments, where the gap character '-' is used to indicate that no
+ base exists in the sequence considered for the alignment position, for instance:
+
+ - Padding example:
+
+ 123 (<-positions)
+ref A-C
+ A-T [from="-" to="" position=2] [from="C" to="T" position=3]
+ ACT [from="" to="C" position=2] [from="C" to="T" position=3]
+ A-T [from="-" to="" position=2] [from="C" to="T" position=3]
+
+ - Mutation example:
+ 123 (<-positions)
+ref ATT
+ ACT [from="T" to="C" position=2]
+
+ -- Example of deletion in a read:
+ 123 (<-positions)
+ref ATT
+ A-T [from="T" to="-" position=2]
+
+ -- Example of insertion of two base pairs in a read:
+ 12345 (<-positions)
+ref A--TT
+ ACCTT [from="" to="CC" position=2]
+
+ */
+message SequenceVariation {
+ /* The reference bases. Can include one or more gap characters '-', to indicate that the reference sequence has
+ no base at this alignment position.
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional string from = 2;
+ /* The read bases that differ from the reference sequence. Can include one or more gap characters '-', to indicate
+ that the query sequence has no base at this alignment position.
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional string to = 1;
+ /*
+ The position of the variation on the read, as if the read always matched on the forward strand.
+ Adding position to the index where the reference starts aligning the read yields the position of the variation
+ in reference/target sequence space. Since position starts at one the resulting position will also be one based.
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 position = 3;
+ /*
+ The position of the variation, starting from the beginning of the aligned read (position 1), and up to the length
+ of the read (inclusive). Use this index if you need to know how far the variation is observed from the beginning
+ of the sequenced read. When the read has an insertion, this index records the position immediately before the base
+ where the bases are inserted (these bases are in the to field).
+ When the read has a deletion, read_index records the position in the read after which the bases that would align
+ in the reference are missing (these bases are in the from field).
+ This field is required (enforced by semantic validation in Goby 2.0+).
+ */
+ optional uint32 read_index = 5;
+
+ /**
+ The read base quality scores for those bases that are given in the to field. This field
+ is populated when the reads used to perform the search include quality scores, and when
+ the alignment parser can extract the information from the aligner's output.
+ (this option is currently not implemented in Goby.)
+ */
+ optional bytes to_quality = 4;
+
+}
+/*
+ This message is written to 'basename'.header
+*/
+
+message AlignmentHeader {
+ /*
+ The smallest possible query index in this alignment. Data stored as an array where
+ queryIndex is the array index will be stored with only the elements in the inclusive
+ range [smallestSplitQueryIndex largestSplitQueryIndex]
+ Such data structures include queryLength and some arrays in the TooManyHits data
+ structure.
+ */
+ optional uint32 smallest_split_query_index = 9;
+ /*
+ The largest possible query index in this alignment. Data stored as an array where
+ queryIndex is the array index will be stored with only the elements in the inclusive
+ range [smallestSplitQueryIndex largestSplitQueryIndex]
+ Such data structures include queryLength and some arrays in the TooManyHits data
+ structure.
+ */
+ optional uint32 largest_split_query_index = 11;
+
+ /* Mapping from query identifier name to query index (as used in alignment entries).
+ */
+ optional IdentifierMapping query_name_mapping = 1;
+
+ /* Mapping from target identifier name to target index (as used in alignment entries).
+ */
+ optional IdentifierMapping target_name_mapping = 2;
+
+ /*
+ The number of query sequences
+ */
+ optional uint32 number_of_queries = 5;
+ /*
+ The number of target sequences
+ */
+ optional uint32 number_of_targets = 6;
+ /*
+ The number of reads that were aligned to the reference and are represented in this alignment archive.
+ */
+ optional uint32 number_of_aligned_reads = 7;
+
+ /*
+ Length of the query sequences. One number per query, in the order of increasing query index.
+ This information has been moved to the individual alignment entries.
+ */
+ repeated uint32 query_length = 3 [deprecated = true];
+ /*
+ If query length is constant across all the queries, this field contains the constant length.
+ In such cases, query_length will be empty.
+ */
+ optional uint32 constant_query_length = 10;
+
+ /*
+ Length of the target sequences. One number per target, in the order of increasing target index.
+ The target indexes must be 0..(number of targets - 1).
+ */
+ repeated uint32 target_length = 8;
+ /*
+ Indicates whether this alignment is sorted by position. True: the alignment entries occur in sorted
+ order, such that entry a occurs before entry b if a.targetIndex< b.targetIndex or, when entries
+ have the same target, when a.position < b.position.
+ */
+ optional bool sorted = 13;
+
+ /*
+ Indicates whether this alignment is indexed by position. When this attribute is true, a file called
+ 'basename'.index exists that contains the AlignmentIndex message (GZip compressed).
+ */
+ optional bool indexed = 14;
+ /*
+ True when query lengths are stored in alignment entries (Goby 1.7+).
+ */
+ optional bool query_lengths_stored_in_entries = 15;
+ /*
+ Name of the aligner that produced this alignment.
+ */
+ optional string aligner_name = 17;
+ /*
+ Version number for the aligner implementation that produced this alignment.
+ */
+ optional string aligner_version = 18;
+ /*
+ The version of Goby that created this alignment file.
+ */
+ optional string version = 25;
+
+ /*
+ Sample basenames, in the order of increasing sampleIndex, starting with sampleIndex=0.
+ */
+
+ repeated string sample_basename = 30;
+
+ /*
+ This field is true when the query indices of alignment entries were permuted to smaller indices. Only sorted
+ alignments can have query_indices_were_permuted=true. When the field is true, and you need to retrieve the
+ original query-index of an alignment (because you want to retrieve the specific read(s) from a read file for
+ instance), you will need the information in the permutation file (extension basename.perm) and transform back
+ each small index of interest to the original query index.
+ */
+ optional bool query_indices_were_permuted = 26;
+ /*
+ This field is true when entries in the alignment .entries file all have the query_index_occurrences field populated
+ (Since Goby 2.0).
+ */
+ optional bool query_index_occurrences = 35;
+
+ /*
+ This field is true when entries in the alignment .entries file all have the ambiguity field populated
+ (Since Goby 2.0).
+ */
+ optional bool ambiguity_stored_in_entries = 36;
+ /*
+ This field is true when entries in the alignment .entries file all have the read_quality_score field populated.
+ (Since Goby 2.0).
+ */
+ optional bool all_read_quality_scores = 40;
+ /*
+ A description of the origin of sets of reads. Serves a similar function to BAM read groups, but more flexible and
+ efficient. Instead of storing strings, we use integers in the entries.
+ Alignment entries will link to a specific ReadOriginInfo with the read_origin_index field.
+ (Since Goby 2.0).
+ */
+ repeated ReadOriginInfo read_origin = 27;
+}
+
+message IdentifierMapping {
+ repeated IdentifierInfo mappings = 1;
+}
+
+message IdentifierInfo {
+ required string name = 1;
+ required uint32 index = 2;
+}
+
+
+/*
+ A description of the origin of sets of reads. Stored in the Goby alignment header and linked
+ from alignment entries. Goby makes it possible to adapt origin equivalence rules on the fly
+ efficiently. To do this, it is sufficient to read the header of the alignment, decide which
+ ReadOriginInfo instances are equivalent (e.g., by looking at sample, platform, library, or
+ other fields in the message), then construct a function e(a):int. This function takes
+ one originIndex parameter and returns another integer that maps to an equivalent class. The
+ equivalence class can be used to estimate error models for entries that belong to each class,
+ for instance.
+ (Since Goby 2.0).
+ */
+message ReadOriginInfo {
+ /*
+ Origin index. An integer that links alignment entries to their origin information.
+ */
+ required uint32 origin_index = 1;
+ /*
+ Identifier that describes the origin of the reads. This field is compatible with the ID field of BAM read
+ groups. Free text.
+ */
+ required string origin_id = 2;
+ /*
+ The sample from which the reads were sequenced. This field is compatible with the SM/sample field of BAM read
+ groups. Free text.
+ */
+ optional string sample = 4;
+ /*
+ The platform on which the reads were sequenced. This field is compatible with the PL/platform field of BAM read
+ groups. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.
+ */
+ optional string platform = 5;
+ /*
+ The library from which the reads were sequenced. This field is compatible with the LB/library field of BAM read
+ groups. Free text.
+ */
+ optional string library = 8;
+ /*
+ The platform unit on which the reads were sequenced. This field is for compatibility with samtools.
+ */
+ optional string platform_unit = 12;
+ /*
+ The date the reads were sequenced. Useful to identify batch effects, in the format dd:MMM:yyyy.
+ The month is Jan, Feb, etc. to avoid all confusion with days when day<=12.
+ */
+ optional string run_date = 6;
+}
+
+/*
+ This message is written to 'basename'.tmh
+*/
+
+message AlignmentTooManyHits {
+ /*
+ The threshold used by the aligner to determine that a query is ambiguous and should be dropped.
+ Referred to as parameter k below.
+ */
+ required uint32 aligner_threshold = 2;
+ /*
+ The hits that are assigned to several (>k) reference locations.
+ */
+ repeated AmbiguousLocation hits = 1;
+
+}
+
+message AmbiguousLocation {
+ /*
+ The index of the query that matched too many times.
+ */
+ required uint32 query_index = 1;
+ /*
+ The number of hits that triggered membership in the too many hits list. The query may hit more
+ locations than reported here, since some alignment tools will just drop queries that match above
+ a threshold and stop counting. This number can be >=k.
+ */
+ required uint32 at_least_number_of_hits = 2;
+ /**
+The length of the part of the query sequence that could be matched to the target (also called depth).
+May be less than the length of the query sequence, in which case the match was not perfect. When merging
+alignments produced by searching different reference sequences, consider only at_least_number_of_hits
+from alignments that have exactly the longer depth for the query. */
+ optional uint32 length_of_match = 3;
+}
+
+/*
+ This message is written to 'basename'.index
+ */
+message AlignmentIndex {
+ /*
+ Stores one element by target sequence. Each element is the cumulative target length for the target
+ stored at index i. Assume there are four target sequences, with lengths {10, 12, 15, 34}. The field
+ targetPositionOffsets will contain: {0,10,22,37}. Such offsets can be used to calculate the absolute
+ position of a genomic location. Given targetIndex and positionOnReference, the absolute location
+ is defined as targetPositionOffsets[targetIndex]+positionOnReference.
+ */
+ repeated uint32 target_position_offsets = 1 [packed = true];
+ /*
+ The byte offsets into the compressed entries file. Byte offsets are matched with absolute position
+ by index. There should be as many elements in offsets as there are in absolutePosition
+ where chunks start which represent entries whose absolute positions are less than
+ */
+ repeated uint64 offsets = 2 [packed = true];
+ /*
+ The absolute positions of the first entry in the chunk that immediately start at offset. One element
+ per chunk in the 'basename'.entries file.
+ */
+ repeated uint64 absolute_positions = 3 [packed = true];
+
+}
diff --git a/sci-biology/goby-cpp/files/Reads.proto b/sci-biology/goby-cpp/files/Reads.proto
new file mode 100644
index 000000000000..32c1244a3eb3
--- /dev/null
+++ b/sci-biology/goby-cpp/files/Reads.proto
@@ -0,0 +1,96 @@
+package goby;
+
+option java_package = "edu.cornell.med.icb.goby.reads";
+option optimize_for = SPEED;
+
+message ReadCollection {
+ repeated ReadEntry reads = 1;
+}
+
+message ReadEntry {
+ /*
+ Index of a read.
+ */
+ required uint32 read_index = 1;
+ /*
+ Index of the barcode, if any.
+ */
+ optional uint32 barcode_index = 10;
+ /*
+ Read identifier/name may be present.
+ */
+ optional string read_identifier = 23;
+ /*
+ Additional description about the read (from Fasta/Q format).
+ */
+ optional string description = 22;
+ /*
+ Length of the sequence.
+ */
+ required uint32 read_length = 2;
+ /*
+ Sequence, encoded as ascii characters stored in single bytes.
+ */
+ optional bytes sequence = 3;
+ /*
+ The second sequence in a pair. Stored the same way as the sequence attribute.
+ */
+ optional bytes sequence_pair = 5;
+ /*
+ Length of the second sequence in a pair.
+ */
+ optional uint32 read_length_pair = 6;
+ /*
+ Quality scores in Phred units, stored as single bytes (0-255).
+ */
+ optional bytes quality_scores = 4;
+ /*
+ Quality scores for the second sequence in a pair. Stored the same way as the 'qualityScores' attribute.
+ */
+ optional bytes quality_scores_pair = 7;
+ /*
+ Compressed stream of data. The first byte indicates the compression/decompression method (codec). The remaining bytes are
+ content compressed with the codec.
+ */
+ optional bytes compressed_data = 8;
+ /*
+ Stores meta-data about the reads. Typically meta-data is stored in the very first read of a
+ read collection, with the understanding that the meta-data applies to all the reads in the
+ collection. Meta-data can be used to store information about when the sample was sequenced,
+ or other information of interest. The key-value pair format is sufficiently flexible to
+ accommodate a variety of needs. The following keys are pre-defined. Please use pre-defined
+ keys so that automated tools can use metadata in a relatively standard way. Please note that
+ some keys provide a format for the value. This format should also be followed to guarantee
+ that meta data can be used computationally in a fully automatic manner.
+
+ key="sequencing-run-start-date" value="MM/DD/YYYY" Used to record when the sequencing run
+ was initiated on the instrument. Can be used to detect batch effects in a large set of samples.
+ key="platform" value="<free-text>". Value is free text, but the following terms are pre-defined.
+ Illumina GaIIx
+ Illumina HiSeq 1000
+ Illumina HiSeq 2000
+ Helicos Heliscope
+ LifeTech 5500 SOLiD
+ LifeTech 5500xl SOLiD
+ Roche 454 GS FLX Ti
+
+ key="organism" value="species name"
+ Since Goby 1.9.1
+ */
+ repeated MetaData meta_data = 25;
+
+}
+/*
+ A message to store a key/value pair and represent metadata about reads.
+ Since Goby 1.9.1
+ */
+message MetaData {
+ /*
+ Provides the key. See examples in the documentation of meta_data for ReadEntry.
+ */
+ required string key=1;
+ /*
+ Describes the value associated with the key. See examples in the documentation of meta_data for ReadEntry.
+ */
+ required string value=2;
+}
diff --git a/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch b/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch
new file mode 100644
index 000000000000..415785466af7
--- /dev/null
+++ b/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch
@@ -0,0 +1,16 @@
+ src/Makefile.am | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/Makefile.am b/src/Makefile.am
+index 1033382..33ca906 100644
+--- a/src/Makefile.am
++++ b/src/Makefile.am
+@@ -84,7 +84,7 @@ GobyReadsStats_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_D
+ GobyReadsStats_SOURCES = \
+ GobyReadsStats.cc
+
+-GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB}
++GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB} -lz
+ GobyFastaToCompact_SOURCES = \
+ GobyFastaToCompact.cc
+
diff --git a/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild b/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild
index 39995da72908..31eddfe16086 100644
--- a/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild
+++ b/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild
@@ -1,8 +1,8 @@
-# Copyright 1999-2012 Gentoo Foundation
+# Copyright 1999-2015 Gentoo Foundation
# Distributed under the terms of the GNU General Public License v2
-# $Header: /var/cvsroot/gentoo-x86/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild,v 1.1 2012/07/19 10:46:06 jlec Exp $
+# $Header: /var/cvsroot/gentoo-x86/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild,v 1.2 2015/04/13 08:26:36 jlec Exp $
-EAPI=4
+EAPI=5
AUTOTOOLS_AUTORECONF=yes
@@ -24,9 +24,19 @@ RDEPEND="${DEPEND}"
S="${WORKDIR}/${PV}/cpp"
+PATCHES=(
+ "${FILESDIR}"/${P}-underlinking.patch
+)
+
src_prepare() {
sed \
-e '/BUILD_TIMESTAMP/s:\(goby/timestamp.h\):$(top_srcdir)/src/\1:g' \
-i src/Makefile.am || die
+
+ pushd src/goby > /dev/null || die
+ cp "${FILESDIR}"/*.proto . || die
+ protoc --cpp_out=. *.proto || die
+ popd > /dev/null || die
+
autotools-utils_src_prepare
}