jlec        15/04/13 08:26:36

  Added:                Alignments.proto goby-cpp-2.0.1-underlinking.patch
                        Reads.proto
  Log:
  Regenerate protocols, bug #514556
  
  (Portage version: 2.2.18/cvs/Linux x86_64, signed Manifest commit with key 
B9D4F231BD1558AB!)

Revision  Changes    Path
1.1                  sci-biology/goby-cpp/files/Alignments.proto

file : 
http://sources.gentoo.org/viewvc.cgi/gentoo-x86/sci-biology/goby-cpp/files/Alignments.proto?rev=1.1&view=markup
plain: 
http://sources.gentoo.org/viewvc.cgi/gentoo-x86/sci-biology/goby-cpp/files/Alignments.proto?rev=1.1&content-type=text/plain

Index: Alignments.proto
===================================================================
/*
  Schema for Goby alignment files. The messages below use proto2 labels
  (optional/required), so the syntax is declared explicitly rather than
  relying on the compiler default.
*/
syntax = "proto2";

package goby;

option java_package = "edu.cornell.med.icb.goby.alignments";

option optimize_for = SPEED;

/*
  This message is written to 'basename'.entries as a very large chunked
  collection.
*/
message AlignmentCollection {
    /* The alignment entries stored in this collection. */
    repeated AlignmentEntry alignment_entries = 1;
}


/*
  Describes one alignment of a query (a short read) against a target
  (e.g., a chromosome). Field numbers do not follow declaration order, and
  field index 23 belonged to a removed field and must not be reused.
*/
message AlignmentEntry {
    /* Multiplicity of this entry. The number of times this alignment entry
       would be repeated exactly the same if query redundancy had not been
       removed by read factorization.
    */
    optional uint32 multiplicity = 7;

    /*
      Compressed stream of data. Removed since Goby 2.0 supports chunk codecs.
      Do not reuse field index 23.
      optional bytes compressed_data = 23;
    */

    /* An integer that uniquely identifies the query (a short read) in a set
       of alignment runs. When several alignment runs are made with the same
       set of query sequences, equality of query index means that the query
       sequences were the same. (Comparing integers for equality is much
       faster than comparing strings.)
       This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 query_index = 1;
    /* An integer that uniquely identifies the target (e.g., a chromosome) in
       a set of alignment runs. When several alignment runs are made with the
       same set of target sequences, equality of target index means that the
       target sequence was the same across the runs. (Comparing integers for
       equality is much faster than comparing strings.)
       This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 target_index = 2;
    /*
     The position on the target of the start of the alignment between the
     query and the target.
     In the following example, position is 3 because the third base of the
     query 'C' was aligned with position 3 of the reference (two read bases
     were soft clipped: "ct"). This example shows that the alignment can start
     at a mismatch if it was so constructed by the aligner.

     0123456789
     AAAAGTCAAA  target
      ctCGTC     query

     This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 position = 3;

    /*
       True when the query matches the target on the reverse strand.
    */
    optional bool matching_reverse_strand = 6;

    /*
     The position on the query where the alignment starts. This value is
     different from zero when some bases/residues of the query could not be
     aligned with the target.
     TODO: Rename this to left_trim. Add a right_trim property.
    */
    optional uint32 query_position = 5;

    /*
     The score of the alignment, where larger scores indicate better matches
     between the query and the target. If an aligner outputs only the number
     of mismatches between query and target, the score is taken to be
     -(#mismatches(query,target)).
    */
    optional float score = 4;

    /*
      Number of bases/residues that differ in the alignment between query and
      target sequences.
    */
    optional uint32 number_of_mismatches = 8;

    /*
     Cumulative number of insertions and/or deletions present in the alignment.
    */
    optional uint32 number_of_indels = 9;

    /*
     Number of bases that have been aligned for the query. Please note that
     query_aligned_length must be less than or equal to query_length.
    */
    optional uint32 query_aligned_length = 11;

    /*
     Number of bases that have been aligned for the target.
    */
    optional uint32 target_aligned_length = 12;

    /*
     Sequence variations between the query and the reference for this
     alignment (see the SequenceVariation message).
    */
    repeated SequenceVariation sequence_variations = 13;

    /*
     Length of the query sequence.
    */
    optional uint32 query_length = 10;
    /*
      Mapping Quality (phred-scaled posterior probability that the mapping
      position of this read is incorrect). Please note that different aligners
      may estimate mapping quality with different approaches, resulting in
      aligner specific differences in the distribution of mapping quality. It
      is recommended to condition mapping quality on the aligner that produced
      the specific alignment being processed. See aligner name and version in
      the header.
      Note that the following description is preliminary. A clear
      specification is needed:
      The mapping quality should be proportional to the
      log of the probability that the given mapping is the "correct" one.
      So if there are five equally good mappings of a read to the genome,
      the probability of each would be 0.2, and the mapping quality would be
      something like -10*log10(1-0.2), approximately 1.  If a mapping is
      highly likely, say a 1e-4 of it being wrong, then the mapping quality
      would be -10*log10(1e-4) = 40.
    */
    optional int32 mapping_quality = 14;

    /*
       If this read was aligned with a pair, the flags for the pair alignment
       (based on SAM):
          000000001    paired
          000000010    properly paired
          000000100    read unmapped
          000001000    mate unmapped
          000010000    read reverse strand
          000100000    mate reverse strand
          001000000    first in pair
          010000000    second in pair
          100000000    not primary alignment
    */
    optional uint32 pair_flags = 15;

    /*
     If there is an alignment entry for the paired read (the paired read was
     mapped), a link to the entry is given.
    */
    optional RelatedAlignmentEntry pair_alignment_link = 16;

    /* Index of the read fragment from which this alignment was obtained. */
    optional uint32 fragment_index = 17;

    /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will
       output two or more alignment entries, one for each matching part of the
       read, and link these entries with spliced_alignment_links. The field
       spliced_forward_alignment_link points to the next AlignmentEntry in the
       chain of spliced alignments.
    */
    optional RelatedAlignmentEntry spliced_forward_alignment_link = 18;

    /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will
       output two or more alignment entries, one for each matching part of the
       read, and link these entries with spliced_alignment_links. The field
       spliced_backward_alignment_link points to the previous AlignmentEntry
       in the chain of spliced alignments.
    */
    optional RelatedAlignmentEntry spliced_backward_alignment_link = 22;

    /*
      If a read spans exon-exon junctions some aligners (e.g., GSNAP) will
      output two alignment entries, one for each matching part of the read,
      and this field describes the spliced_alignment_link with these binary
      flags:
        000000001    normal
        000000010    novel
    */
    optional uint32 spliced_flags = 19;

    /* The size of the insert used when making the sequence library. This is
       the total size of the DNA fragment to sequence, without the adapters.
       This is not the length of sequence that separates the reads.
       See http://seqanswers.com/forums/showthread.php?t=8730 for details.
       Insert size is inferred for each pair of reads by the aligner and is
       recorded here if it was estimated (i.e., for paired-end reads).
    */
    optional sint32 insert_size = 20;

    /*
       The sample index. Uniquely identifies the aligned sample this read was
       read from. Storing the sample index in the alignment entry makes it
       possible to concat alignments from different origins and track what
       sample originally contained each entry.
    */
    optional uint32 sample_index = 21;
    /*
        The total number of times the query index associated with this entry
        occurs across the entire alignment file.

        This field is used to purge queryIndex->smallIndex associations after
        all instances of a query index have been seen (see the
        QueryIndexPermutation class). When each entry has a value for this
        field, the header field query_index_occurrences is true.
        This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 query_index_occurrences = 25;
    /*
        The total number of times the read matches the reference across the
        entire alignment file. This differs from query_index_occurrences
        because reads that are matching through splice and pair links count as
        one for ambiguity.
        The field can be used to filter by ambiguity-threshold on the fly
        after an alignment has been done (to restrict entries to smaller
        thresholds). When each entry has a value for this field, the header
        field ambiguity_stored_in_entries is true.

        This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 ambiguity = 27;
    /*
        List of BAM attributes, if the alignment was imported from BAM. The
        attributes are stored in exactly the format allowed for BAM. For
        instance, X0:i:9  X1:i:1  MD:Z:68 RG:Z:SRR084825 will be stored as
        four strings: "X0:i:9", "X1:i:1", "MD:Z:68", "RG:Z:SRR084825". Note
        that sam-to-compact will interpret some BAM attributes and populate
        goby native fields. Such tags do not appear in bam_attributes, and are
        instead re-generated from the corresponding goby native fields.
        Since Goby 2.0.
    */
    repeated string bam_attributes = 50;
    /*
        Quality scores for all bases of the read.
        Since Goby 2.0.
    */
    optional bytes read_quality_scores = 55;

    /*
        Origin index. An integer that references a ReadOriginInfo message in
        the alignment header and makes it possible to track the origin of the
        read (especially useful after several alignments have been
        merged/concatenated).
        (Since Goby 2.0).
    */
    optional uint32 read_origin_index = 26;
    /*
    Bases that an aligner considered do not belong to the alignment of the
    read to the reference. Potentially erroneous bases, or bases that belong
    to a different part of the reference genome. Left clipped bases are stored
    in this field as character bases, or as an equal sign character '=' when
    the clipped base did match the reference base. For instance "A=G" for
    three soft-clipped bases, the middle one matching the genome at this
    position. The number of bases in softClippedBasesLeft is exactly equal to
    queryPosition.
    */
    optional string softClippedBasesLeft = 30;
    /*
    Bases that an aligner considered do not belong to the alignment of the
    read to the reference. Potentially erroneous bases, or bases that belong
    to a different part of the reference genome. Right clipped bases are
    stored in this field as character bases, or as an equal sign character '='
    when the clipped base did match the reference base. The number of bases in
    softClippedBasesRight is exactly equal to
    queryLength - queryAlignedLength - queryPosition.
    */
    optional string softClippedBasesRight = 31;

    /*
    Quality scores for bases in softClippedBasesLeft.  Stored in Phred Units.
    */
    optional bytes softClippedQualityLeft = 32;
    /*
    Quality scores for bases in softClippedBasesRight.  Stored in Phred Units.
    */
    optional bytes softClippedQualityRight = 33;
    /*
     Sequence for a read placed near this entry, but unmapped to the reference
     sequence. For instance, used to record the sequence of a mate that did
     not map to the reference. We know that the mate maps in the proximity of
     this entry (it is placed) but are unable to map it to a specific genomic
     position. The sequence is always given as obtained from the reads file.
    */
    optional string placedUnmappedSequence=40;
    /*
    Quality scores for a read placed near this entry.  Phred units.
    */
    optional bytes placedUnmappedQuality=41;

    /*
    Read name. In SAM/BAM this is referred to as QNAME. Paired and segmented
    reads will have the same read name.
    */
    optional string readName=42;
}

/* A link to another alignment entry. This message type is used to represent
   relations between alignments, such as the relation between the two read
   fragments in a paired-end protocol, or the relation between parts of reads
   that align through an exon-exon junction and map in different locations of
   the genome.
  */
message RelatedAlignmentEntry {
    /* Target index of the location where the other alignment entry is mapped.
       This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 target_index = 1;

    /* Position on the reference where the other alignment entry is mapped.
       This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 position = 2;

    /* Index of the fragment for the related alignment entry. This index
       makes it possible to identify which of the read fragments mapped to the
       given location is related to the source alignment entry.
       This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 fragment_index = 3;

    /* NOTE(review): the purpose of this field is not documented in this
       file; confirm its meaning against the Goby sources before relying on
       it.
    */
    optional uint32 optimized_index=50;
}

/*
   Represents sequence variations between the query and the reference
   sequences. Many variations can be represented.
   For instance, an insertion at position 5 in the reference would be
   represented as from="A", to="" position=5.
   A mutation T->G at position 6 would be rendered as from="T", to="G"
   position=6. Padded alignments (see SAM description) can be described by a
   combination of pair-wise alignments, where the gap character '-' is used to
   indicate that no base exists in the sequence considered for the alignment
   position, for instance:

   - Padding example:

    123 (<-positions)
ref A-C
    A-T [from="-" to=""  position=2] [from="C" to="T"  position=3]
    ACT [from=""  to="C" position=2] [from="C" to="T"  position=3]
    A-T [from="-" to=""  position=2] [from="C" to="T"  position=3]

   - Mutation example:
    123 (<-positions)
ref ATT
    ACT [from="T"  to="C" position=2]

   - Example of deletion in a read:
    123 (<-positions)
ref ATT
    A-T [from="T"  to="-" position=2]

   - Example of insertion of two base pairs in a read:
    12345 (<-positions)
ref A--TT
    ACCTT [from=""  to="CC" position=2]

  */
message SequenceVariation {
    /* The reference bases. Can include one or more gap characters '-', to
       indicate that the reference sequence has no base at this alignment
       position.
       This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional string from = 2;
    /* The read bases that differ from the reference sequence.  Can include
       one or more gap characters '-', to indicate that the query sequence has
       no base at this alignment position.
       This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional string to = 1;
    /*
    The position of the variation on the read, as if the read always matched
    on the forward strand.
    Adding position to the index where the reference starts aligning the read
    yields the position of the variation in reference/target sequence space.
    Since position starts at one the resulting position will also be one
    based.
    This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 position = 3;
    /*
    The position of the variation, starting from the beginning of the aligned
    read (position 1), and up to the length of the read (inclusive). Use this
    index if you need to know how far the variation is observed from the
    beginning of the sequenced read. When the read has an insertion, this
    index records the position immediately before the base where the bases are
    inserted (these bases are in the to field).
    When the read has a deletion, read_index records the position in the read
    after which the bases that would align in the reference are missing (these
    bases are in the from field).
    This field is required (enforced by semantic validation in Goby 2.0+).
    */
    optional uint32 read_index = 5;

    /*
      The read base quality scores for those bases that are given in the to
      field. This field is populated when the reads used to perform the search
      include quality scores, and when the alignment parser can extract the
      information from the aligner's output.
      (This option is currently not implemented in Goby.)
    */
    optional bytes to_quality = 4;

}
/*
  This message is written to 'basename'.header
*/

message AlignmentHeader {
    /*
     The smallest possible query index in this alignment. Data stored as an
     array where queryIndex is the array index will be stored with only the
     elements in the inclusive range
     [smallestSplitQueryIndex largestSplitQueryIndex].
     Such data structures include queryLength and some arrays in the
     TooManyHits data structure.
    */
    optional uint32 smallest_split_query_index = 9;
    /*
     The largest possible query index in this alignment. Data stored as an
     array where queryIndex is the array index will be stored with only the
     elements in the inclusive range
     [smallestSplitQueryIndex largestSplitQueryIndex].
     Such data structures include queryLength and some arrays in the
     TooManyHits data structure.
    */
    optional uint32 largest_split_query_index = 11;

    /* Mapping from query identifier name to query index (as used in alignment
       entries).
    */
    optional IdentifierMapping query_name_mapping = 1;

    /* Mapping from target identifier name to target index (as used in
       alignment entries).
    */
    optional IdentifierMapping target_name_mapping = 2;

    /*
     The number of query sequences.
    */
    optional uint32 number_of_queries = 5;
    /*
      The number of target sequences.
    */
    optional uint32 number_of_targets = 6;
    /*
      The number of reads that were aligned to the reference and are
      represented in this alignment archive.
    */
    optional uint32 number_of_aligned_reads = 7;

    /*
      Length of the query sequences. One number per query, in the order of
      increasing query index.
      Deprecated: this information has been moved to the individual alignment
      entries.
    */
    repeated uint32 query_length = 3 [deprecated = true];
    /*
       If query length is constant across all the queries, this field contains
       the constant length.
       In such cases, query_length will be empty.
    */
    optional uint32 constant_query_length = 10;

    /*
      Length of the target sequences. One number per target, in the order of
      increasing target index.
      The target indexes must be 0..(number of targets - 1).
    */
    repeated uint32 target_length = 8;
    /*
       Indicates whether this alignment is sorted by position. True: the
       alignment entries occur in sorted order, such that entry a occurs
       before entry b if a.targetIndex < b.targetIndex or, when entries have
       the same target, when a.position < b.position.
    */
    optional bool sorted = 13;

    /*
       Indicates whether this alignment is indexed by position. When this
       attribute is true, a file called 'basename'.index exists that contains
       the AlignmentIndex message (GZip compressed).
    */
    optional bool indexed = 14;
    /*
      True when query lengths are stored in alignment entries (Goby 1.7+).
    */
    optional bool query_lengths_stored_in_entries = 15;
    /*
      Name of the aligner that produced this alignment.
    */
    optional string aligner_name = 17;
    /*
      Version number for the aligner implementation that produced this
      alignment.
    */
    optional string aligner_version = 18;
    /*
       The version of Goby that created this alignment file.
    */
    optional string version = 25;

    /*
      Sample basenames, in the order of increasing sampleIndex, starting with
      sampleIndex=0.
    */

    repeated string sample_basename = 30;

    /*
       This field is true when the query indices of alignment entries were
       permuted to smaller indices. Only sorted alignments can have
       query_indices_were_permuted=true. When the field is true, and you need
       to retrieve the original query index of an alignment (because you want
       to retrieve the specific read(s) from a read file for instance), you
       will need the information in the permutation file (extension
       basename.perm) and transform back each small index of interest to the
       original query index.
    */
    optional bool query_indices_were_permuted = 26;
    /*
       This field is true when entries in the alignment .entries file all have
       the query_index_occurrences field populated.
       (Since Goby 2.0).
    */
    optional bool query_index_occurrences = 35;

    /*
       This field is true when entries in the alignment .entries file all have
       the ambiguity field populated.
       (Since Goby 2.0).
    */
    optional bool ambiguity_stored_in_entries = 36;
    /*
       This field is true when entries in the alignment .entries file all have
       the read_quality_scores field populated.
       (Since Goby 2.0).
    */
    optional bool all_read_quality_scores = 40;
    /*
      A description of the origin of sets of reads. Serves a similar function
      to BAM read groups, but more flexible and efficient. Instead of storing
      strings, we use integers in the entries.
      Alignment entries link to a specific ReadOriginInfo through its
      origin_index, which entries store in their read_origin_index field.
      (Since Goby 2.0).
    */
    repeated ReadOriginInfo read_origin = 27;
}

/*
  A collection of name/index associations. Used by the alignment header to
  map query and target identifier names to the indices used in alignment
  entries.
*/
message IdentifierMapping {
    /* One element per identifier name/index association. */
    repeated IdentifierInfo mappings = 1;
}

/*
  A single association between an identifier name and its index. Both fields
  are declared required.
*/
message IdentifierInfo {
    /* The identifier name. */
    required string name = 1;
    /* The index associated with the name. */
    required uint32 index = 2;
}


/*
     A description of the origin of sets of reads. Stored in the Goby
     alignment header and linked from alignment entries. Goby makes it
     possible to adapt origin equivalence rules on the fly efficiently. To do
     this, it is sufficient to read the header of the alignment, decide which
     ReadOriginInfo instances are equivalent (e.g., by looking at sample,
     platform, library, or other fields in the message), then construct a
     function e(a):int. This function takes one originIndex parameter and
     returns another integer that maps to an equivalence class. The
     equivalence class can be used to estimate error models for entries that
     belong to each class, for instance.
     (Since Goby 2.0).
 */
message ReadOriginInfo {
    /*
       Origin index. An integer that links alignment entries to their origin
       information.
    */
    required uint32 origin_index = 1;
    /*
       Identifier that describes the origin of the reads. This field is
       compatible with the ID field of BAM read groups. Free text.
    */
    required string origin_id = 2;
    /*
       The sample from which the reads were sequenced. This field is
       compatible with the SM/sample field of BAM read groups. Free text.
    */
    optional string sample = 4;
    /*
       The platform on which the reads were sequenced. This field is
       compatible with the PL/platform field of BAM read groups.
       Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.
    */
    optional string platform = 5;
    /*
       The library from which the reads were sequenced. This field is
       compatible with the LB/library field of BAM read groups. Free text.
    */
    optional string library = 8;
    /*
       The platform unit on which the reads were sequenced. This field is
       provided for compatibility with samtools.
    */
    optional string platform_unit = 12;
    /*
       The date the reads were sequenced. Useful to identify batch effects, in
       the format dd:MMM:yyyy.
       The month is Jan, Feb, etc. to avoid all confusion with days when
       day<=12.
    */
    optional string run_date = 6;
}

/*
  This message is written to 'basename'.tmh
*/

message AlignmentTooManyHits {
    /*
    The threshold used by the aligner to determine that a query is ambiguous
    and should be dropped.
    Referred to as parameter k below.
    */
    required uint32 aligner_threshold = 2;
    /*
     The hits that are assigned to several (>k) reference locations.
    */
    repeated AmbiguousLocation hits = 1;

}

/*
  Describes one query that matched too many reference locations. Instances
  are listed in the hits field of AlignmentTooManyHits, whose
  aligner_threshold is the parameter k referred to below.
*/
message AmbiguousLocation {
    /*
     The index of the query that matched too many times.
    */
    required uint32 query_index = 1;
    /*
     The number of hits that triggered membership in the too many hits list.
     The query may hit more locations than reported here, since some alignment
     tools will just drop queries that match above a threshold and stop
     counting. This number can be >=k.
    */
    required uint32 at_least_number_of_hits = 2;
    /*
     The length of the part of the query sequence that could be matched to the
     target (also called depth). May be less than the length of the query
     sequence, in which case the match was not perfect. When merging
     alignments produced by searching different reference sequences, consider
     only at_least_number_of_hits from alignments that have exactly the longer
     depth for the query.
    */
    optional uint32 length_of_match = 3;
}

/*
      This message is written to 'basename'.index
  */
message AlignmentIndex {
    /*
      Stores one element per target sequence. Each element is the cumulative
      target length for the target stored at index i. Assume there are four
      target sequences, with lengths {10, 12, 15, 34}. The field
      targetPositionOffsets will contain: {0,10,22,37}. Such offsets can be
      used to calculate the absolute position of a genomic location. Given
      targetIndex and positionOnReference, the absolute location is defined
      as targetPositionOffsets[targetIndex]+positionOnReference.
    */
    repeated uint32 target_position_offsets = 1 [packed = true];
    /*
     The byte offsets into the compressed entries file. Byte offsets are
     matched with absolute position by index. There should be as many elements
     in offsets as there are in absolutePosition
     where chunks start which represent entries whose absolute positions are
     less than
     NOTE(review): the sentence above is truncated in the original comment;
     recover the intended wording from the Goby sources.
    */
    repeated uint64 offsets = 2 [packed = true];
    /*
      The absolute positions of the first entry in the chunk that immediately
      starts at offset. One element per chunk in the 'basename'.entries file.
    */
    repeated uint64 absolute_positions = 3 [packed = true];

}



1.1                  
sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch

file : 
http://sources.gentoo.org/viewvc.cgi/gentoo-x86/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch?rev=1.1&view=markup
plain: 
http://sources.gentoo.org/viewvc.cgi/gentoo-x86/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch?rev=1.1&content-type=text/plain

Index: goby-cpp-2.0.1-underlinking.patch
===================================================================
 src/Makefile.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile.am b/src/Makefile.am
index 1033382..33ca906 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -84,7 +84,7 @@ GobyReadsStats_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_D
 GobyReadsStats_SOURCES = \
        GobyReadsStats.cc
 
-GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB}
+GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB} -lz
 GobyFastaToCompact_SOURCES = \
        GobyFastaToCompact.cc
 



1.1                  sci-biology/goby-cpp/files/Reads.proto

file : 
http://sources.gentoo.org/viewvc.cgi/gentoo-x86/sci-biology/goby-cpp/files/Reads.proto?rev=1.1&view=markup
plain: 
http://sources.gentoo.org/viewvc.cgi/gentoo-x86/sci-biology/goby-cpp/files/Reads.proto?rev=1.1&content-type=text/plain

Index: Reads.proto
===================================================================
/*
  Schema for Goby read collections. The messages below use proto2 labels
  (optional/required), so the syntax is declared explicitly rather than
  relying on the compiler default.
*/
syntax = "proto2";

package goby;

option java_package = "edu.cornell.med.icb.goby.reads";
option optimize_for = SPEED;

/*
  A collection of read entries.
*/
message ReadCollection {
     /* The reads stored in this collection. */
     repeated ReadEntry reads = 1;
}

/*
  A single read, with an optional second sequence for paired reads.
  read_index and read_length are declared required; every other field is
  optional.
*/
message ReadEntry {
  /*
    Index of a read.
  */
  required uint32 read_index = 1;
  /*
    Index of the barcode, if any.
  */
  optional uint32 barcode_index = 10;
  /*
     Read identifier/name may be present.
  */
  optional string read_identifier = 23;
  /*
     Additional description about the read (from Fasta/Q format).
   */
  optional string description = 22;
  /*
    Length of the sequence.
   */
  required uint32 read_length = 2;
  /*
    Sequence, encoded as ascii characters stored in single bytes.
   */
  optional bytes sequence = 3;
  /*
   The second sequence in a pair. Stored the same way as the sequence
   attribute.
  */
  optional bytes sequence_pair = 5;
  /*
    Length of the second sequence in a pair.
  */
  optional uint32 read_length_pair = 6;
  /*
    Quality scores in Phred units, stored as single bytes (0-255).
  */
  optional bytes quality_scores = 4;
  /*
    Quality scores for the second sequence in a pair. Stored the same way as
    the 'qualityScores' attribute.
   */
  optional bytes quality_scores_pair = 7;
  /*
    Compressed stream of data. The first byte indicates the
    compression/decompression method (codec). The remaining bytes are
    content compressed with the codec.
  */
  optional bytes compressed_data = 8;
  /*
     Stores meta-data about the reads. Typically meta-data is stored in the
     very first read of a read collection, with the understanding that the
     meta-data applies to all the reads in the collection. Meta-data can be
     used to store information about when the sample was sequenced, or other
     information of interest. The key-value pair format is sufficiently
     flexible to accommodate a variety of needs. The following keys are
     pre-defined. Please use pre-defined keys so that automated tools can use
     metadata in a relatively standard way. Please note that some keys provide
     a format for the value. This format should also be followed to guarantee
     that meta-data can be used computationally in a fully automatic manner.

     key="sequencing-run-start-date" value="MM/DD/YYYY" Used to record when
     the sequencing run was initiated on the instrument. Can be used to detect
     batch effects in a large set of samples.
     key="platform" value="<free-text>". Value is free text, but the following
     terms are pre-defined.
      Illumina GaIIx
      Illumina HiSeq 1000
      Illumina HiSeq 2000
      Helicos Heliscope
      LifeTech 5500 SOLiD
      LifeTech 5500xl SOLiD
      Roche 454 GS FLX Ti

      key="organism" value="species name"
      Since Goby 1.9.1
  */
  repeated MetaData meta_data = 25;

}
/*
 A message to store a key/value pair and represent metadata about reads.
 Both fields are declared required.
 Since Goby 1.9.1
 */
message MetaData {
 /*
   Provides the key. See examples in the documentation of meta_data for
   ReadEntry.
 */
 required string key=1;
 /*
   Describes the value associated with the key. See examples in the
   documentation of meta_data for ReadEntry.
 */
 required string value=2;
}




Reply via email to