This is an automated email from the ASF dual-hosted git repository.

kriskras99 pushed a commit to branch feat/documentation
in repository https://gitbox.apache.org/repos/asf/avro-rs.git
commit 523c54bfedb8efa623d62c92bb2ca2b5abaae5a6
Author: Kriskras99 <[email protected]>
AuthorDate: Sun Feb 1 22:46:43 2026 +0100

    docs: Rework the documentation

    [The first page of our documentation](https://docs.rs/apache-avro/0.21.0/apache_avro/)
    was overwhelmingly large. This commit moves the documentation to several places:
    - The introduction to Avro itself was moved to `documentation::primer`
    - Using the crate the "Avro" way was moved to `documentation::dynamic`
      - I've also changed the focus to the fact that it allows for more dynamic code
    - Using the crate the "Serde" way was moved to `serde`
    - Calculating schema fingerprints was moved to `Schema::fingerprint`
      - Users can easily find it by searching for fingerprint
    - Custom name validators were moved to `validator`
      - Can be found via search and listed under the modules on the first page
    - Custom schema equality was moved to `schema_equality`
      - Can be found via search and listed under the modules on the first page

    It also removes some sections:
    - How to install the library and enable features; this is basic Rust knowledge
    - The section about breaking changes in minor versions; this is to be expected as the crate is at `0.*`
    - Reading and writing logical types; this was a very large section but not very useful
    - Ill-formed data; instead, the error message was improved by adding a link to the function to change the limit

    In addition, I've enabled some Clippy lints related to documentation and fixed all the
    warnings they caused.

    Future work:
    - Enable the `clippy.missing_errors_doc` and `clippy.missing_panics_doc` lints
    - Deprecate some types that live in the root namespace
      - For example, the codec-related types. Instead, the `codec` module should be public
      - This will further improve the signal-to-noise ratio on the first page
---
 Cargo.toml                        |   5 +
 avro/src/bytes.rs                 |  91 ++--
 avro/src/documentation/dynamic.rs | 262 +++++++++++
 avro/src/documentation/mod.rs     |   7 +
 avro/src/documentation/primer.rs  |  88 ++++
 avro/src/duration.rs              |   4 +-
 avro/src/error.rs                 |   4 +-
 avro/src/headers.rs               |  25 +-
 avro/src/lib.rs                   | 917 +-------------------------------------
 avro/src/rabin.rs                 |  12 +-
 avro/src/reader.rs                |  12 +-
 avro/src/schema/mod.rs            |  74 ++-
 avro/src/schema/name.rs           |  15 +-
 avro/src/schema/parser.rs         |  42 +-
 avro/src/schema/union.rs          |   2 +-
 avro/src/schema_compatibility.rs  |  94 ++--
 avro/src/schema_equality.rs       |  52 ++-
 avro/src/serde/derive.rs          | 292 ++++++++++--
 avro/src/serde/mod.rs             |  92 ++++
 avro/src/serde/ser_schema.rs      |  41 +-
 avro/src/serde/with.rs            |  28 +-
 avro/src/types.rs                 |   8 +-
 avro/src/util.rs                  |  19 +-
 avro/src/validator.rs             |  96 ++--
 avro/src/writer.rs                |  18 +-
 avro/tests/io.rs                  |   2 +-
 avro_derive/build.rs              |   2 +-
 avro_derive/src/case.rs           |  16 +-
 avro_derive/src/lib.rs            |   8 +
 avro_derive/tests/ui.rs           |   2 +-
 avro_test_helper/src/lib.rs       |   3 +-
 31 files changed, 1143 insertions(+), 1190 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index ed46b50..184db25 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,4 +54,9 @@ opt-level = "s"
 [workspace.lints]
 # By setting the priority to -1, this lint group can be overridden for specific lints
 clippy.all = { level = "warn", priority = -1 }
+clippy.too_long_first_doc_paragraph = "warn"
+clippy.doc_markdown = "warn"
+# TODO: Needs more work
+#clippy.missing_errors_doc = "warn"
+#clippy.missing_panics_doc = "warn"
 rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(nightly)'] }
diff --git a/avro/src/bytes.rs b/avro/src/bytes.rs
index 534ce47..341b01d 100644
--- a/avro/src/bytes.rs
+++ b/avro/src/bytes.rs
@@ -15,20 +15,21 @@
 //
specific language governing permissions and limitations // under the License. -//! Deprecated. See [`apache_avro::serde::*`] instead. +//! Deprecated. Use [`apache_avro::serde::*`] instead. //! //! [`apache_avro::serde::*`](crate::serde) -// Deprecated. See [`apache_avro::serde::bytes`] instead. -// -// [`apache_avro::serde::bytes`](crate::serde::bytes) +#[cfg(doc)] +use crate as apache_avro; + +/// Deprecated. Use [`apache_avro::serde::bytes`] instead. #[deprecated(since = "0.22.0", note = "Use `apache_avro::serde::bytes` instead")] pub mod serde_avro_bytes { + #[cfg(doc)] + use crate as apache_avro; use serde::{Deserializer, Serializer}; - // Deprecated. See [`apache_avro::serde::bytes::serialize`] instead. - // - // [`apache_avro::serde::bytes::serialize`](crate::serde::bytes::serialize) + /// Deprecated. Use [`apache_avro::serde::bytes::serialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::bytes::serialize` instead" @@ -40,9 +41,7 @@ pub mod serde_avro_bytes { crate::serde::bytes::serialize(bytes, serializer) } - // Deprecated. See [`apache_avro::serde::bytes::deserialize`] instead. - // - // [`apache_avro::serde::bytes::deserialize`](crate::serde::bytes::deserialize) + /// Deprecated. Use [`apache_avro::serde::bytes::deserialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::bytes::deserialize` instead" @@ -55,17 +54,17 @@ pub mod serde_avro_bytes { } } -// Deprecated. See [`apache_avro::serde::bytes_opt`] instead. -// -// [`apache_avro::serde::bytes_opt`](crate::serde::bytes_opt) +/// Deprecated. Use [`apache_avro::serde::bytes_opt`] instead. +/// +/// [`apache_avro::serde::bytes_opt`](crate::serde::bytes_opt) #[deprecated(since = "0.22.0", note = "Use `apache_avro::serde::bytes_opt` instead")] pub mod serde_avro_bytes_opt { + #[cfg(doc)] + use crate as apache_avro; use serde::{Deserializer, Serializer}; use std::borrow::Borrow; - // Deprecated. See [`apache_avro::serde::bytes_opt::serialize`] instead. - // - // [`apache_avro::serde::bytes_opt::serialize`](crate::serde::bytes_opt::serialize) + /// Deprecated. Use [`apache_avro::serde::bytes_opt::serialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::bytes_opt::serialize` instead" @@ -78,9 +77,7 @@ pub mod serde_avro_bytes_opt { crate::serde::bytes_opt::serialize(bytes, serializer) } - // Deprecated. See [`apache_avro::serde::bytes_opt::deserialize`] instead. - // - // [`apache_avro::serde::bytes_opt::deserialize`](crate::serde::bytes_opt::deserialize) + /// Deprecated. Use [`apache_avro::serde::bytes_opt::deserialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::bytes_opt::deserialize` instead" @@ -93,16 +90,14 @@ pub mod serde_avro_bytes_opt { } } -// Deprecated. See [`apache_avro::serde::fixed`] instead. -// -// [`apache_avro::serde::fixed`](crate::serde::fixed) +/// Deprecated. Use [`apache_avro::serde::fixed`] instead. #[deprecated(since = "0.22.0", note = "Use `apache_avro::serde::fixed` instead")] pub mod serde_avro_fixed { + #[cfg(doc)] + use crate as apache_avro; use serde::{Deserializer, Serializer}; - // Deprecated. See [`apache_avro::serde::fixed::serialize`] instead. - // - // [`apache_avro::serde::fixed::serialize`](crate::serde::fixed::serialize) + /// Deprecated. Use [`apache_avro::serde::fixed::serialize`] instead. 
#[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::fixed::serialize` instead" @@ -114,9 +109,7 @@ pub mod serde_avro_fixed { crate::serde::fixed::serialize(bytes, serializer) } - // Deprecated. See [`apache_avro::serde::fixed::deserialize`] instead. - // - // [`apache_avro::serde::fixed::deserialize`](crate::serde::fixed::deserialize) + /// Deprecated. Use [`apache_avro::serde::fixed::deserialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::fixed::deserialize` instead" @@ -129,17 +122,15 @@ pub mod serde_avro_fixed { } } -// Deprecated. See [`apache_avro::serde::fixed_opt`] instead. -// -// [`apache_avro::serde::fixed_opt`](crate::serde::fixed_opt) +/// Deprecated. Use [`apache_avro::serde::fixed_opt`] instead. #[deprecated(since = "0.22.0", note = "Use `apache_avro::serde::fixed_opt` instead")] pub mod serde_avro_fixed_opt { + #[cfg(doc)] + use crate as apache_avro; use serde::{Deserializer, Serializer}; use std::borrow::Borrow; - // Deprecated. See [`apache_avro::serde::fixed_opt::serialize`] instead. - // - // [`apache_avro::serde::fixed_opt::serialize`](crate::serde::fixed_opt::serialize) + /// Deprecated. Use [`apache_avro::serde::fixed_opt::serialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::fixed_opt::serialize` instead" @@ -152,9 +143,7 @@ pub mod serde_avro_fixed_opt { crate::serde::fixed_opt::serialize(bytes, serializer) } - // Deprecated. See [`apache_avro::serde::fixed_opt::deserialize`] instead. - // - // [`apache_avro::serde::fixed_opt::deserialize`](crate::serde::fixed_opt::deserialize) + /// Deprecated. Use [`apache_avro::serde::fixed_opt::deserialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::fixed_opt::deserialize` instead" @@ -167,16 +156,14 @@ pub mod serde_avro_fixed_opt { } } -// Deprecated. See [`apache_avro::serde::slice`] instead. -// -// [`apache_avro::serde::slice`](crate::serde::slice) +/// Deprecated. Use [`apache_avro::serde::slice`] instead. #[deprecated(since = "0.22.0", note = "Use `apache_avro::serde::slice` instead")] pub mod serde_avro_slice { + #[cfg(doc)] + use crate as apache_avro; use serde::{Deserializer, Serializer}; - // Deprecated. See [`apache_avro::serde::slice::serialize`] instead. - // - // [`apache_avro::serde::slice::serialize`](crate::serde::slice::serialize) + /// Deprecated. Use [`apache_avro::serde::slice::serialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::slice::serialize` instead" @@ -188,9 +175,7 @@ pub mod serde_avro_slice { crate::serde::slice::serialize(bytes, serializer) } - // Deprecated. See [`apache_avro::serde::slice::deserialize`] instead. - // - // [`apache_avro::serde::slice::deserialize`](crate::serde::slice::deserialize) + /// Deprecated. Use [`apache_avro::serde::slice::deserialize`] instead. #[deprecated( since = "0.22.0", note = "Use `apache_avro::serde::slice::deserialize` instead" @@ -203,17 +188,15 @@ pub mod serde_avro_slice { } } -// Deprecated. See [`apache_avro::serde::slice_opt`] instead. -// -// [`apache_avro::serde::slice_opt`](crate::serde::slice_opt) +/// Deprecated. Use [`apache_avro::serde::slice_opt`] instead. #[deprecated(since = "0.22.0", note = "Use `apache_avro::serde::slice_opt` instead")] pub mod serde_avro_slice_opt { + #[cfg(doc)] + use crate as apache_avro; use serde::{Deserializer, Serializer}; use std::borrow::Borrow; - // Deprecated. See [`apache_avro::serde::slice_opt::serialize`] instead. 
-    //
-    // [`apache_avro::serde::slice_opt::serialize`](crate::serde::slice_opt::serialize)
+    /// Deprecated. Use [`apache_avro::serde::slice_opt::serialize`] instead.
     #[deprecated(
         since = "0.22.0",
         note = "Use `apache_avro::serde::slice_opt::serialize` instead"
     )]
@@ -226,9 +209,7 @@ pub mod serde_avro_slice_opt {
         crate::serde::slice_opt::serialize(bytes, serializer)
     }

-    // Deprecated. See [`apache_avro::serde::slice_opt::deserialize`] instead.
-    //
-    // [`apache_avro::serde::slice_opt::deserialize`](crate::serde::slice_opt::deserialize)
+    /// Deprecated. Use [`apache_avro::serde::slice_opt::deserialize`] instead.
     #[deprecated(
         since = "0.22.0",
         note = "Use `apache_avro::serde::slice_opt::deserialize` instead"
     )]
diff --git a/avro/src/documentation/dynamic.rs b/avro/src/documentation/dynamic.rs
new file mode 100644
index 0000000..0e522c2
--- /dev/null
+++ b/avro/src/documentation/dynamic.rs
@@ -0,0 +1,262 @@
+//! # Using Avro in Rust, the dynamic way
+//!
+//! ## Creating a schema
+//!
+//! Avro data cannot exist without an Avro schema. Schemas **must** be used while writing and
+//! **can** be used while reading; they carry the information regarding the type of data we are
+//! handling. Avro schemas are used for both schema validation and resolution of Avro data.
+//!
+//! Avro schemas are defined in **JSON** format and can just be parsed out of a raw string:
+//!
+//! ```
+//! use apache_avro::Schema;
+//!
+//! let raw_schema = r#"
+//!     {
+//!         "type": "record",
+//!         "name": "test",
+//!         "fields": [
+//!             {"name": "a", "type": "long", "default": 42},
+//!             {"name": "b", "type": "string"}
+//!         ]
+//!     }
+//! "#;
+//!
+//! // if the schema is not valid, this function will return an error
+//! let schema = Schema::parse_str(raw_schema).unwrap();
+//!
+//! // schemas can be printed for debugging
+//! println!("{:?}", schema);
+//! ```
+//!
+//! Additionally, a list of definitions (which may depend on each other) can be given and all of
+//! them will be parsed into the corresponding schemas.
+//!
+//! ```
+//! use apache_avro::Schema;
+//!
+//! let raw_schema_1 = r#"{
+//!         "name": "A",
+//!         "type": "record",
+//!         "fields": [
+//!             {"name": "field_one", "type": "float"}
+//!         ]
+//!     }"#;
+//!
+//! // This definition depends on the definition of A above
+//! let raw_schema_2 = r#"{
+//!         "name": "B",
+//!         "type": "record",
+//!         "fields": [
+//!             {"name": "field_one", "type": "A"}
+//!         ]
+//!     }"#;
+//!
+//! // if the schemas are not valid, this function will return an error
+//! let schemas = Schema::parse_list(&[raw_schema_1, raw_schema_2]).unwrap();
+//!
+//! // schemas can be printed for debugging
+//! println!("{:?}", schemas);
+//! ```
+//!
+//! ## Writing data
+//!
+//! Once we have defined a schema, we are ready to serialize data in Avro, validating it against
+//! the provided schema in the process. Here we use the dynamic way, built around the
+//! [`Value`](crate::types::Value) type.
+//!
+//! Given that the schema we defined above is that of an Avro *Record*, we are going to use the
+//! associated type provided by the library to specify the data we want to serialize:
+//!
+//! ```
+//! # use apache_avro::Schema;
+//! use apache_avro::types::Record;
+//! use apache_avro::Writer;
+//! #
+//! # let raw_schema = r#"
+//! #     {
+//! #         "type": "record",
+//! #         "name": "test",
+//! #         "fields": [
+//! #             {"name": "a", "type": "long", "default": 42},
+//! #             {"name": "b", "type": "string"}
+//! #         ]
+//! #     }
+//! # "#;
+//! # let schema = Schema::parse_str(raw_schema).unwrap();
+//! // a writer needs a schema and something to write to
+//! let mut writer = Writer::new(&schema, Vec::new()).unwrap();
+//!
+//! // the Record type models our Record schema
+//! let mut record = Record::new(writer.schema()).unwrap();
+//! record.put("a", 27i64);
+//! record.put("b", "foo");
+//!
+//! // schema validation happens here
+//! writer.append_value(record).unwrap();
+//!
+//! // this is how to get back the resulting Avro-encoded bytes
+//! // this performs a flush operation to make sure data has been written, so it can fail
+//! // you can also call `writer.flush()` yourself without consuming the writer
+//! let encoded = writer.into_inner().unwrap();
+//! ```
+//!
+//! The vast majority of the time, schemas tend to define a record as a top-level container
+//! encapsulating all the values to convert as fields and providing documentation for them, but in
+//! case we want to directly define an Avro value, the library offers that capability via the
+//! `Value` interface.
+//!
+//! ```
+//! use apache_avro::types::Value;
+//!
+//! let mut value = Value::String("foo".to_string());
+//! ```
+//!
+//! ## Reading data
+//!
+//! As far as reading Avro-encoded data goes, we can just use the schema encoded with the data to
+//! read it. The library will do this automatically for us, as it already does for the compression
+//! codec:
+//!
+//! ```
+//! use apache_avro::Reader;
+//! # use apache_avro::Schema;
+//! # use apache_avro::types::Record;
+//! # use apache_avro::Writer;
+//! #
+//! # let raw_schema = r#"
+//! #     {
+//! #         "type": "record",
+//! #         "name": "test",
+//! #         "fields": [
+//! #             {"name": "a", "type": "long", "default": 42},
+//! #             {"name": "b", "type": "string"}
+//! #         ]
+//! #     }
+//! # "#;
+//! # let schema = Schema::parse_str(raw_schema).unwrap();
+//! # let mut writer = Writer::new(&schema, Vec::new()).unwrap();
+//! # let mut record = Record::new(writer.schema()).unwrap();
+//! # record.put("a", 27i64);
+//! # record.put("b", "foo");
+//! # writer.append_value(record).unwrap();
+//! # let input = writer.into_inner().unwrap();
+//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
+//! let reader = Reader::new(&input[..]).unwrap();
+//!
+//! // value is a Result of an Avro Value in case the read operation fails
+//! for value in reader {
+//!     println!("{:?}", value.unwrap());
+//! }
+//! ```
+//!
+//! If instead we want to specify a different (but compatible) reader schema from the one the data
+//! was written with, we can do the following:
+//! ```
+//! use apache_avro::Schema;
+//! use apache_avro::Reader;
+//! # use apache_avro::types::Record;
+//! # use apache_avro::Writer;
+//! #
+//! # let writer_raw_schema = r#"
+//! #     {
+//! #         "type": "record",
+//! #         "name": "test",
+//! #         "fields": [
+//! #             {"name": "a", "type": "long", "default": 42},
+//! #             {"name": "b", "type": "string"}
+//! #         ]
+//! #     }
+//! # "#;
+//! # let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
+//! # let mut writer = Writer::new(&writer_schema, Vec::new()).unwrap();
+//! # let mut record = Record::new(writer.schema()).unwrap();
+//! # record.put("a", 27i64);
+//! # record.put("b", "foo");
+//! # writer.append_value(record).unwrap();
+//! # let input = writer.into_inner().unwrap();
+//!
+//! let reader_raw_schema = r#"
+//!     {
+//!         "type": "record",
+//!         "name": "test",
+//!         "fields": [
+//!             {"name": "a", "type": "long", "default": 42},
+//!             {"name": "b", "type": "string"},
+//!             {"name": "c", "type": "long", "default": 43}
+//!         ]
+//!     }
+//! "#;
+//!
+//! let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
+//!
+//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
+//! let reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
+//!
+//! // value is a Result of an Avro Value in case the read operation fails
+//! for value in reader {
+//!     println!("{:?}", value.unwrap());
+//! }
+//! ```
+//!
+//! The library will also automatically perform schema resolution while reading the data.
+//!
+//! For more information about schema compatibility and resolution, please refer to the
+//! [Avro Specification](https://avro.apache.org/docs/current/specification/#schema-declaration).
+//!
+//! ## Putting everything together
+//!
+//! The following is an example of how to combine everything shown so far and is meant to be a
+//! quick reference of the [`Value`](crate::types::Value) interface:
+//!
+//! ```
+//! use apache_avro::{Codec, DeflateSettings, Reader, Schema, Writer, from_value, types::Record, Error};
+//! use serde::{Deserialize, Serialize};
+//!
+//! #[derive(Debug, Deserialize, Serialize)]
+//! struct Test {
+//!     a: i64,
+//!     b: String,
+//! }
+//!
+//! fn main() -> Result<(), Error> {
+//!     let raw_schema = r#"
+//!     {
+//!         "type": "record",
+//!         "name": "test",
+//!         "fields": [
+//!             {"name": "a", "type": "long", "default": 42},
+//!             {"name": "b", "type": "string"}
+//!         ]
+//!     }
+//!     "#;
+//!
+//!     let schema = Schema::parse_str(raw_schema)?;
+//!
+//!     println!("{:?}", schema);
+//!
+//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default())).unwrap();
+//!
+//!     let mut record = Record::new(writer.schema()).unwrap();
+//!     record.put("a", 27i64);
+//!     record.put("b", "foo");
+//!
+//!     writer.append_value(record)?;
+//!
+//!     let test = Test {
+//!         a: 27,
+//!         b: "foo".to_owned(),
+//!     };
+//!
+//!     writer.append_ser(test)?;
+//!
+//!     let input = writer.into_inner()?;
+//!     let reader = Reader::with_schema(&schema, &input[..])?;
+//!
+//!     for record in reader {
+//!         println!("{:?}", from_value::<Test>(&record?));
+//!     }
+//!     Ok(())
+//! }
+//! ```
+//!
diff --git a/avro/src/documentation/mod.rs b/avro/src/documentation/mod.rs
new file mode 100644
index 0000000..b451407
--- /dev/null
+++ b/avro/src/documentation/mod.rs
@@ -0,0 +1,7 @@
+//! # General documentation that does not fit in a particular module
+//!
+//! This module does not contain any code, and is only available during `rustdoc` builds.
+//!
+
+pub mod dynamic;
+pub mod primer;
diff --git a/avro/src/documentation/primer.rs b/avro/src/documentation/primer.rs
new file mode 100644
index 0000000..7380302
--- /dev/null
+++ b/avro/src/documentation/primer.rs
@@ -0,0 +1,88 @@
+//! # A primer on Apache Avro
+//!
+//! Avro is a schema-based encoding system, like Protobuf. This means that if you have raw Avro data
+//! without a schema, you are unable to decode it. It also means that the format is very
+//! space-efficient.
+//!
+//! ## Schemas
+//!
+//! Schemas are defined in JSON and look like this:
+//! ```json
+//! {
+//!     "type": "record",
+//!     "name": "example",
+//!     "fields": [
+//!         {"name": "a", "type": "long", "default": 42},
+//!         {"name": "b", "type": "string"}
+//!     ]
+//! }
+//! ```
+//! For all possible types and extra attributes, see [the schema section of the specification].
+//!
+//! Schemas can depend on each other. For example, the schema defined above can be referenced by
+//! another schema, or a schema can include itself:
+//! ```json
+//! {
+//!     "type": "record",
+//!     "name": "references",
+//!     "fields": [
+//!         {"name": "a", "type": "example"},
+//!         {"name": "b", "type": "bytes"},
+//!         {"name": "recursive", "type": ["null", "references"]}
+//!     ]
+//! }
+//! ```
+//!
+//! Schemas are represented using the [`Schema`](crate::Schema) type.
+//!
+//! [the schema section of the specification]: https://avro.apache.org/docs/++version++/specification/#schema-declaration
+//!
+//! ## Data serialization and deserialization
+//! There are various formats to encode and decode Avro data. Most formats use the Avro binary encoding.
+//!
+//! #### [Object Container File](https://avro.apache.org/docs/++version++/specification/#object-container-files)
+//! This is the most common file format used for Avro; it uses the binary encoding. It includes the
+//! schema in the file, and can therefore be decoded by a reader who doesn't have the schema. It includes
+//! many records in one file.
+//!
+//! This file format can be used via the [`Reader`](crate::Reader) and [`Writer`](crate::Writer) types.
+//!
+//! #### [Single Object Encoding](https://avro.apache.org/docs/++version++/specification/#single-object-encoding)
+//! This file format also uses the binary encoding, but the schema is not included directly. It instead
+//! includes a fingerprint of the schema, which a reader can look up in a schema database or compare
+//! with the fingerprint that the reader is expecting. This file format always contains one record.
+//!
+//! This file format can be used via the [`GenericSingleObjectReader`](crate::GenericSingleObjectReader),
+//! [`GenericSingleObjectWriter`](crate::GenericSingleObjectWriter), [`SpecificSingleObjectReader`](crate::SpecificSingleObjectReader),
+//! and [`SpecificSingleObjectWriter`](crate::SpecificSingleObjectWriter) types.
+//!
+//! #### Avro datums
+//! This is not really a file format, as it's just the raw Avro binary data. It does not include a
+//! schema and can therefore not be decoded without the reader knowing **exactly** which schema was
+//! used to write it.
+//!
+//! Raw datums can be read and written via the [`to_avro_datum`](crate::to_avro_datum), [`from_avro_datum`](crate::from_avro_datum),
+//! [`to_avro_datum_schemata`](crate::to_avro_datum_schemata), [`from_avro_datum_schemata`](crate::from_avro_datum_schemata),
+//! [`from_avro_datum_reader_schemata`](crate::from_avro_datum_reader_schemata), and
+//! [`write_avro_datum_ref`](crate::write_avro_datum_ref) functions.
+//!
+//! #### [Avro JSON](https://avro.apache.org/docs/++version++/specification/#json-encoding)
+//! Not to be confused with the schema definition, which is also in JSON. This is the Avro data encoded
+//! in JSON.
+//!
+//! It can be used via the [`From<serde_json::Value> for Value`](crate::types::Value) and
+//! [`TryFrom<Value> for serde_json::Value`](crate::types::Value) implementations.
+//!
+//! ## Compression
+//! For records with low entropy it can be useful to compress the encoded data. With the
+//! [Object Container File](#object-container-file) format this is directly possible in Avro.
+//! Avro supports various compression codecs:
+//!
+//! - deflate
+//! - bzip2
+//! - Snappy
+//! - XZ
+//! - Zstandard
+//!
+//! All readers are required to implement the `deflate` codec, but most implementations support most
+//! codecs.
+//!
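+//! As a minimal sketch of how a codec is chosen (this mirrors the `Writer::with_codec` call shown
+//! in [`documentation::dynamic`](crate::documentation::dynamic); the deflate codec with its default
+//! settings and a trivial schema are assumed here):
+//!
+//! ```
+//! use apache_avro::{Codec, DeflateSettings, Schema, Writer};
+//!
+//! # let raw_schema = r#"{"type": "record", "name": "test", "fields": [{"name": "b", "type": "string"}]}"#;
+//! # let schema = Schema::parse_str(raw_schema).unwrap();
+//! // the codec is chosen once, when the writer is created; every block
+//! // of the resulting Object Container File is compressed with it
+//! let writer =
+//!     Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default())).unwrap();
+//! ```
+//!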
diff --git a/avro/src/duration.rs b/avro/src/duration.rs index cf24bc0..eecfca1 100644 --- a/avro/src/duration.rs +++ b/avro/src/duration.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -/// A struct representing duration that hides the details of endianness and conversion between -/// platform-native u32 and byte arrays. use serde::{Deserialize, Serialize, de}; +/// A struct representing duration that hides the details of endianness and conversion between +/// platform-native u32 and byte arrays. #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub struct Duration { months: Months, diff --git a/avro/src/error.rs b/avro/src/error.rs index cbb00c0..4bd6a25 100644 --- a/avro/src/error.rs +++ b/avro/src/error.rs @@ -554,11 +554,11 @@ pub enum Details { #[error("Failed to convert JSON to string: {0}")] ConvertJsonToString(#[source] serde_json::Error), - /// Error while converting float to json value + /// Error while converting float to JSON value #[error("failed to convert avro float to json: {0}")] ConvertF64ToJson(f64), - /// Error while resolving Schema::Ref + /// Error while resolving [`Schema::Ref`] #[error("Unresolved schema reference: {0}")] SchemaResolutionError(Name), diff --git a/avro/src/headers.rs b/avro/src/headers.rs index dce134f..8a97f7b 100644 --- a/avro/src/headers.rs +++ b/avro/src/headers.rs @@ -20,17 +20,18 @@ use uuid::Uuid; use crate::{AvroResult, Schema, rabin::Rabin, schema::SchemaFingerprint}; -/// This trait represents that an object is able to construct an Avro message header. It is -/// implemented for some known header types already. If you need a header type that is not already -/// included here, then you can create your own struct and implement this trait. +/// This trait represents that an object is able to construct an Avro message header. +/// +/// It is implemented for some known header types already. If you need a header type that is not +/// already included here, then you can create your own struct and implement this trait. pub trait HeaderBuilder { fn build_header(&self) -> Vec<u8>; } -/// HeaderBuilder based on the Rabin schema fingerprint +/// [`HeaderBuilder`] based on the Rabin schema fingerprint. /// /// This is the default and will be used automatically by the `new` impls in -/// [crate::reader::GenericSingleObjectReader] and [crate::writer::GenericSingleObjectWriter]. +/// [`GenericSingleObjectReader`](crate::GenericSingleObjectReader) and [`GenericSingleObjectWriter`](crate::GenericSingleObjectWriter). pub struct RabinFingerprintHeader { fingerprint: SchemaFingerprint, } @@ -53,16 +54,15 @@ impl HeaderBuilder for RabinFingerprintHeader { } } -/// HeaderBuilder based on -/// [Glue](https://docs.aws.amazon.com/glue/latest/dg/what-is-glue.html) schema UUID +/// [`HeaderBuilder`] for [Glue](https://docs.aws.amazon.com/glue/latest/dg/what-is-glue.html). /// -/// See the function docs for usage details +/// See the function docs for usage details. pub struct GlueSchemaUuidHeader { schema_uuid: Uuid, } impl GlueSchemaUuidHeader { - /// Create an instance of the struct from a Glue Schema UUID + /// Create an instance of the struct from a Glue Schema UUID. /// /// Code for writing messages will most likely want to use this. You will need to determine /// via other means the correct Glue schema UUID and use it with this method to be able to @@ -72,12 +72,11 @@ impl GlueSchemaUuidHeader { } /// The minimum length of a Glue header. 
-    /// 2 bytes for the special prefix (3, 0) plus
-    /// 16 bytes for the Uuid
+    ///
+    /// 2 bytes for the special prefix (3, 0) plus 16 bytes for the Uuid.
     const GLUE_HEADER_LENGTH: usize = 18;

-    /// Create an instance of the struct based on parsing the UUID out of the header of a raw
-    /// message
+    /// Create an instance of the struct based on parsing the UUID out of the header of a raw message.
     ///
     /// Code for reading messages will most likely want to use this. Once you receive the raw bytes
     /// of a message, use this function to build the struct from it. That struct can then be used
diff --git a/avro/src/lib.rs b/avro/src/lib.rs
index d7aece5..75bd7f4 100644
--- a/avro/src/lib.rs
+++ b/avro/src/lib.rs
@@ -32,912 +32,35 @@
 //! Please check our [documentation](https://docs.rs/apache-avro) for examples, tutorials and API reference.
 //!
 //! **[Apache Avro](https://avro.apache.org/)** is a data serialization system which provides rich
-//! data structures and a compact, fast, binary data format.
+//! data structures and a compact, fast, binary data format. If you are not familiar with the data
+//! format, please read [`documentation::primer`] first.
 //!
-//! All data in Avro is schematized, as in the following example:
+//! There are two ways of working with Avro data in this crate:
 //!
-//! ```json
-//! {
-//!     "type": "record",
-//!     "name": "test",
-//!     "fields": [
-//!         {"name": "a", "type": "long", "default": 42},
-//!         {"name": "b", "type": "string"}
-//!     ]
-//! }
-//! ```
+//! 1. Via the generic [`Value`](types::Value) type.
+//! 2. Via types implementing [`Serialize`](::serde::Serialize), [`Deserialize`](::serde::Deserialize), and [`AvroSchema`].
 //!
-//! There are basically two ways of handling Avro data in Rust:
+//! The first option is great for dealing with Avro data in a dynamic way, for example, when working
+//! with unknown or rapidly changing schemas, or when you don't want to map the data to Rust types. The
+//! module documentation of [`documentation::dynamic`] explains how to work in this dynamic way.
 //!
-//! * **as Avro-specialized data types** based on an Avro schema;
-//! * **as generic Rust serde-compatible types** implementing/deriving `Serialize` and `Deserialize`;
+//! The second option is great when dealing with static schemas that should be decoded to and encoded
+//! from Rust types. The module documentation of [`serde`] explains how to work in this static way.
 //!
-//! **apache-avro** provides a way to read and write both these data representations easily and
-//! efficiently.
+//! # Features
 //!
-//! # Installing the library
+//! - `derive`: enable support for deriving [`AvroSchema`]
+//! - `snappy`: enable support for the Snappy codec
+//! - `zstandard`: enable support for the Zstandard codec
+//! - `bzip`: enable support for the Bzip2 codec
+//! - `xz`: enable support for the Xz codec
 //!
+//! # MSRV
 //!
-//! Add to your `Cargo.toml`:
+//! The current MSRV is 1.88.0.
 //!
-//! ```toml
-//! [dependencies]
-//! apache-avro = "x.y"
-//! ```
+//! The MSRV may be bumped in minor releases.
 //!
-//! Or in case you want to leverage the **Snappy** codec:
-//!
-//! ```toml
-//! [dependencies.apache-avro]
-//! version = "x.y"
-//! features = ["snappy"]
-//! ```
-//!
-//! Or in case you want to leverage the **Zstandard** codec:
-//!
-//! ```toml
-//! [dependencies.apache-avro]
-//! version = "x.y"
-//! features = ["zstandard"]
-//! ```
-//!
-//! Or in case you want to leverage the **Bzip2** codec:
-//!
-//! ```toml
-//! [dependencies.apache-avro]
-//! version = "x.y"
-//!
features = ["bzip"] -//! ``` -//! -//! Or in case you want to leverage the **Xz** codec: -//! -//! ```toml -//! [dependencies.apache-avro] -//! version = "x.y" -//! features = ["xz"] -//! ``` -//! -//! # Upgrading to a newer minor version -//! -//! The library is still in beta, so there might be backward-incompatible changes between minor -//! versions. If you have troubles upgrading, check the release notes. -//! -//! # Minimum supported Rust version -//! -//! 1.88.0 -//! -//! # Defining a schema -//! -//! An Avro data cannot exist without an Avro schema. Schemas **must** be used while writing and -//! **can** be used while reading and they carry the information regarding the type of data we are -//! handling. Avro schemas are used for both schema validation and resolution of Avro data. -//! -//! Avro schemas are defined in **JSON** format and can just be parsed out of a raw string: -//! -//! ``` -//! use apache_avro::Schema; -//! -//! let raw_schema = r#" -//! { -//! "type": "record", -//! "name": "test", -//! "fields": [ -//! {"name": "a", "type": "long", "default": 42}, -//! {"name": "b", "type": "string"} -//! ] -//! } -//! "#; -//! -//! // if the schema is not valid, this function will return an error -//! let schema = Schema::parse_str(raw_schema).unwrap(); -//! -//! // schemas can be printed for debugging -//! println!("{:?}", schema); -//! ``` -//! -//! Additionally, a list of of definitions (which may depend on each other) can be given and all of -//! them will be parsed into the corresponding schemas. -//! -//! ``` -//! use apache_avro::Schema; -//! -//! let raw_schema_1 = r#"{ -//! "name": "A", -//! "type": "record", -//! "fields": [ -//! {"name": "field_one", "type": "float"} -//! ] -//! }"#; -//! -//! // This definition depends on the definition of A above -//! let raw_schema_2 = r#"{ -//! "name": "B", -//! "type": "record", -//! "fields": [ -//! {"name": "field_one", "type": "A"} -//! ] -//! }"#; -//! -//! // if the schemas are not valid, this function will return an error -//! let schemas = Schema::parse_list(&[raw_schema_1, raw_schema_2]).unwrap(); -//! -//! // schemas can be printed for debugging -//! println!("{:?}", schemas); -//! ``` -//! *N.B.* It is important to note that the composition of schema definitions requires schemas with names. -//! For this reason, only schemas of type Record, Enum, and Fixed should be input into this function. -//! -//! The library provides also a programmatic interface to define schemas without encoding them in -//! JSON (for advanced use), but we highly recommend the JSON interface. Please read the API -//! reference in case you are interested. -//! -//! For more information about schemas and what kind of information you can encapsulate in them, -//! please refer to the appropriate section of the -//! [Avro Specification](https://avro.apache.org/docs/current/specification/#schema-declaration). -//! -//! # Writing data -//! -//! Once we have defined a schema, we are ready to serialize data in Avro, validating them against -//! the provided schema in the process. As mentioned before, there are two ways of handling Avro -//! data in Rust. -//! -//! **NOTE:** The library also provides a low-level interface for encoding a single datum in Avro -//! bytecode without generating markers and headers (for advanced use), but we highly recommend the -//! `Writer` interface to be totally Avro-compatible. Please read the API reference in case you are -//! interested. -//! -//! ## The avro way -//! -//! 
Given that the schema we defined above is that of an Avro *Record*, we are going to use the -//! associated type provided by the library to specify the data we want to serialize: -//! -//! ``` -//! # use apache_avro::Schema; -//! use apache_avro::types::Record; -//! use apache_avro::Writer; -//! # -//! # let raw_schema = r#" -//! # { -//! # "type": "record", -//! # "name": "test", -//! # "fields": [ -//! # {"name": "a", "type": "long", "default": 42}, -//! # {"name": "b", "type": "string"} -//! # ] -//! # } -//! # "#; -//! # let schema = Schema::parse_str(raw_schema).unwrap(); -//! // a writer needs a schema and something to write to -//! let mut writer = Writer::new(&schema, Vec::new()).unwrap(); -//! -//! // the Record type models our Record schema -//! let mut record = Record::new(writer.schema()).unwrap(); -//! record.put("a", 27i64); -//! record.put("b", "foo"); -//! -//! // schema validation happens here -//! writer.append_value(record).unwrap(); -//! -//! // this is how to get back the resulting avro bytecode -//! // this performs a flush operation to make sure data has been written, so it can fail -//! // you can also call `writer.flush()` yourself without consuming the writer -//! let encoded = writer.into_inner().unwrap(); -//! ``` -//! -//! The vast majority of the times, schemas tend to define a record as a top-level container -//! encapsulating all the values to convert as fields and providing documentation for them, but in -//! case we want to directly define an Avro value, the library offers that capability via the -//! `Value` interface. -//! -//! ``` -//! use apache_avro::types::Value; -//! -//! let mut value = Value::String("foo".to_string()); -//! ``` -//! -//! ## The serde way -//! -//! Given that the schema we defined above is an Avro *Record*, we can directly use a Rust struct -//! deriving `Serialize` to model our data: -//! -//! ``` -//! # use apache_avro::Schema; -//! # use serde::Serialize; -//! use apache_avro::Writer; -//! -//! #[derive(Debug, Serialize)] -//! struct Test { -//! a: i64, -//! b: String, -//! } -//! -//! # let raw_schema = r#" -//! # { -//! # "type": "record", -//! # "name": "test", -//! # "fields": [ -//! # {"name": "a", "type": "long", "default": 42}, -//! # {"name": "b", "type": "string"} -//! # ] -//! # } -//! # "#; -//! # let schema = Schema::parse_str(raw_schema).unwrap(); -//! // a writer needs a schema and something to write to -//! let mut writer = Writer::new(&schema, Vec::new()).unwrap(); -//! -//! // the structure models our Record schema -//! let test = Test { -//! a: 27, -//! b: "foo".to_owned(), -//! }; -//! -//! // schema validation happens here -//! writer.append_ser(test).unwrap(); -//! -//! // this is how to get back the resulting avro bytecode -//! // this performs a flush operation to make sure data is written, so it can fail -//! // you can also call `writer.flush()` yourself without consuming the writer -//! let encoded = writer.into_inner(); -//! ``` -//! -//! ### Importance of the fields' order -//! -//! *Important*: The order of the fields in the struct must match the order of the fields in the Avro schema! -//! -//! ### Simple types -//! -//! The vast majority of the times, schemas tend to define a record as a top-level container -//! encapsulating all the values to convert as fields and providing documentation for them, but in -//! case we want to directly define an Avro value, any type implementing `Serialize` should work. -//! -//! ``` -//! let value = "foo".to_string(); -//! ``` -//! -//! 
## Using codecs to compress data -//! -//! Avro supports three different compression codecs when encoding data: -//! -//! * **Null**: leaves data uncompressed; -//! * **Deflate**: writes the data block using the deflate algorithm as specified in RFC 1951, and -//! typically implemented using the zlib library. Note that this format (unlike the "zlib format" in -//! RFC 1950) does not have a checksum. -//! * **Snappy**: uses Google's [Snappy](http://google.github.io/snappy/) compression library. Each -//! compressed block is followed by the 4-byte, big-endianCRC32 checksum of the uncompressed data in -//! the block. You must enable the `snappy` feature to use this codec. -//! * **Zstandard**: uses Facebook's [Zstandard](https://facebook.github.io/zstd/) compression library. -//! You must enable the `zstandard` feature to use this codec. -//! * **Bzip2**: uses [BZip2](https://sourceware.org/bzip2/) compression library. -//! You must enable the `bzip` feature to use this codec. -//! * **Xz**: uses [liblzma](https://github.com/portable-network-archive/liblzma-rs) compression library. -//! You must enable the `xz` feature to use this codec. -//! -//! To specify a codec to use to compress data, just specify it while creating a `Writer`: -//! ``` -//! use apache_avro::{Codec, DeflateSettings, Schema, Writer}; -//! # -//! # let raw_schema = r#" -//! # { -//! # "type": "record", -//! # "name": "test", -//! # "fields": [ -//! # {"name": "a", "type": "long", "default": 42}, -//! # {"name": "b", "type": "string"} -//! # ] -//! # } -//! # "#; -//! # let schema = Schema::parse_str(raw_schema).unwrap(); -//! let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default())); -//! ``` -//! -//! # Reading data -//! -//! As far as reading Avro encoded data goes, we can just use the schema encoded with the data to -//! read them. The library will do it automatically for us, as it already does for the compression -//! codec: -//! -//! ``` -//! use apache_avro::Reader; -//! # use apache_avro::Schema; -//! # use apache_avro::types::Record; -//! # use apache_avro::Writer; -//! # -//! # let raw_schema = r#" -//! # { -//! # "type": "record", -//! # "name": "test", -//! # "fields": [ -//! # {"name": "a", "type": "long", "default": 42}, -//! # {"name": "b", "type": "string"} -//! # ] -//! # } -//! # "#; -//! # let schema = Schema::parse_str(raw_schema).unwrap(); -//! # let mut writer = Writer::new(&schema, Vec::new()).unwrap(); -//! # let mut record = Record::new(writer.schema()).unwrap(); -//! # record.put("a", 27i64); -//! # record.put("b", "foo"); -//! # writer.append_value(record).unwrap(); -//! # let input = writer.into_inner().unwrap(); -//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed -//! let reader = Reader::new(&input[..]).unwrap(); -//! ``` -//! -//! In case, instead, we want to specify a different (but compatible) reader schema from the schema -//! the data has been written with, we can just do as the following: -//! ``` -//! use apache_avro::Schema; -//! use apache_avro::Reader; -//! # use apache_avro::types::Record; -//! # use apache_avro::Writer; -//! # -//! # let writer_raw_schema = r#" -//! # { -//! # "type": "record", -//! # "name": "test", -//! # "fields": [ -//! # {"name": "a", "type": "long", "default": 42}, -//! # {"name": "b", "type": "string"} -//! # ] -//! # } -//! # "#; -//! # let writer_schema = Schema::parse_str(writer_raw_schema).unwrap(); -//! 
# let mut writer = Writer::new(&writer_schema, Vec::new()).unwrap(); -//! # let mut record = Record::new(writer.schema()).unwrap(); -//! # record.put("a", 27i64); -//! # record.put("b", "foo"); -//! # writer.append_value(record).unwrap(); -//! # let input = writer.into_inner().unwrap(); -//! -//! let reader_raw_schema = r#" -//! { -//! "type": "record", -//! "name": "test", -//! "fields": [ -//! {"name": "a", "type": "long", "default": 42}, -//! {"name": "b", "type": "string"}, -//! {"name": "c", "type": "long", "default": 43} -//! ] -//! } -//! "#; -//! -//! let reader_schema = Schema::parse_str(reader_raw_schema).unwrap(); -//! -//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed -//! let reader = Reader::with_schema(&reader_schema, &input[..]).unwrap(); -//! ``` -//! -//! The library will also automatically perform schema resolution while reading the data. -//! -//! For more information about schema compatibility and resolution, please refer to the -//! [Avro Specification](https://avro.apache.org/docs/current/specification/#schema-declaration). -//! -//! As usual, there are two ways to handle Avro data in Rust, as you can see below. -//! -//! **NOTE:** The library also provides a low-level interface for decoding a single datum in Avro -//! bytecode without markers and header (for advanced use), but we highly recommend the `Reader` -//! interface to leverage all Avro features. Please read the API reference in case you are -//! interested. -//! -//! -//! ## The avro way -//! -//! We can just read directly instances of `Value` out of the `Reader` iterator: -//! -//! ``` -//! # use apache_avro::Schema; -//! # use apache_avro::types::Record; -//! # use apache_avro::Writer; -//! use apache_avro::Reader; -//! # -//! # let raw_schema = r#" -//! # { -//! # "type": "record", -//! # "name": "test", -//! # "fields": [ -//! # {"name": "a", "type": "long", "default": 42}, -//! # {"name": "b", "type": "string"} -//! # ] -//! # } -//! # "#; -//! # let schema = Schema::parse_str(raw_schema).unwrap(); -//! # let schema = Schema::parse_str(raw_schema).unwrap(); -//! # let mut writer = Writer::new(&schema, Vec::new()).unwrap(); -//! # let mut record = Record::new(writer.schema()).unwrap(); -//! # record.put("a", 27i64); -//! # record.put("b", "foo"); -//! # writer.append_value(record).unwrap(); -//! # let input = writer.into_inner().unwrap(); -//! let reader = Reader::new(&input[..]).unwrap(); -//! -//! // value is a Result of an Avro Value in case the read operation fails -//! for value in reader { -//! println!("{:?}", value.unwrap()); -//! } -//! -//! ``` -//! -//! ## The serde way -//! -//! Alternatively, we can use a Rust type implementing `Deserialize` and representing our schema to -//! read the data into: -//! -//! ``` -//! # use apache_avro::Schema; -//! # use apache_avro::Writer; -//! # use serde::{Deserialize, Serialize}; -//! use apache_avro::Reader; -//! use apache_avro::from_value; -//! -//! # #[derive(Serialize)] -//! #[derive(Debug, Deserialize)] -//! struct Test { -//! a: i64, -//! b: String, -//! } -//! -//! # let raw_schema = r#" -//! # { -//! # "type": "record", -//! # "name": "test", -//! # "fields": [ -//! # {"name": "a", "type": "long", "default": 42}, -//! # {"name": "b", "type": "string"} -//! # ] -//! # } -//! # "#; -//! # let schema = Schema::parse_str(raw_schema).unwrap(); -//! # let mut writer = Writer::new(&schema, Vec::new()).unwrap(); -//! # let test = Test { -//! # a: 27, -//! # b: "foo".to_owned(), -//! # }; -//! 
# writer.append_ser(test).unwrap(); -//! # let input = writer.into_inner().unwrap(); -//! let reader = Reader::new(&input[..]).unwrap(); -//! -//! // value is a Result in case the read operation fails -//! for value in reader { -//! println!("{:?}", from_value::<Test>(&value.unwrap())); -//! } -//! ``` -//! -//! # Putting everything together -//! -//! The following is an example of how to combine everything showed so far and it is meant to be a -//! quick reference of the library interface: -//! -//! ``` -//! use apache_avro::{Codec, DeflateSettings, Reader, Schema, Writer, from_value, types::Record, Error}; -//! use serde::{Deserialize, Serialize}; -//! -//! #[derive(Debug, Deserialize, Serialize)] -//! struct Test { -//! a: i64, -//! b: String, -//! } -//! -//! fn main() -> Result<(), Error> { -//! let raw_schema = r#" -//! { -//! "type": "record", -//! "name": "test", -//! "fields": [ -//! {"name": "a", "type": "long", "default": 42}, -//! {"name": "b", "type": "string"} -//! ] -//! } -//! "#; -//! -//! let schema = Schema::parse_str(raw_schema)?; -//! -//! println!("{:?}", schema); -//! -//! let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default())).unwrap(); -//! -//! let mut record = Record::new(writer.schema()).unwrap(); -//! record.put("a", 27i64); -//! record.put("b", "foo"); -//! -//! writer.append_value(record)?; -//! -//! let test = Test { -//! a: 27, -//! b: "foo".to_owned(), -//! }; -//! -//! writer.append_ser(test)?; -//! -//! let input = writer.into_inner()?; -//! let reader = Reader::with_schema(&schema, &input[..])?; -//! -//! for record in reader { -//! println!("{:?}", from_value::<Test>(&record?)); -//! } -//! Ok(()) -//! } -//! ``` -//! -//! `apache-avro` also supports the logical types listed in the [Avro specification](https://avro.apache.org/docs/current/specification/#logical-types): -//! -//! 1. `Decimal` using the [`num_bigint`](https://docs.rs/num-bigint/latest/num_bigint) crate -//! 1. UUID using the [`uuid`](https://docs.rs/uuid/latest/uuid) crate -//! 1. Date, Time (milli) as `i32` and Time (micro) as `i64` -//! 1. Timestamp (milli and micro) as `i64` -//! 1. Local timestamp (milli and micro) as `i64` -//! 1. Duration as a custom type with `months`, `days` and `millis` accessor methods each of which returns an `i32` -//! -//! Note that the on-disk representation is identical to the underlying primitive/complex type. -//! -//! ### Read and write logical types -//! -//! ```rust -//! use apache_avro::{ -//! types::Record, types::Value, Codec, Days, Decimal, DeflateSettings, Duration, Millis, Months, Reader, Schema, -//! Writer, Error, -//! }; -//! use num_bigint::ToBigInt; -//! -//! fn main() -> Result<(), Error> { -//! let raw_schema = r#" -//! { -//! "type": "record", -//! "name": "test", -//! "fields": [ -//! { -//! "name": "decimal_fixed", -//! "type": { -//! "type": "fixed", -//! "size": 2, -//! "name": "decimal" -//! }, -//! "logicalType": "decimal", -//! "precision": 4, -//! "scale": 2 -//! }, -//! { -//! "name": "decimal_var", -//! "type": "bytes", -//! "logicalType": "decimal", -//! "precision": 10, -//! "scale": 3 -//! }, -//! { -//! "name": "uuid", -//! "type": "string", -//! "logicalType": "uuid" -//! }, -//! { -//! "name": "date", -//! "type": "int", -//! "logicalType": "date" -//! }, -//! { -//! "name": "time_millis", -//! "type": "int", -//! "logicalType": "time-millis" -//! }, -//! { -//! "name": "time_micros", -//! "type": "long", -//! "logicalType": "time-micros" -//! }, -//! { -//! 
"name": "timestamp_millis", -//! "type": "long", -//! "logicalType": "timestamp-millis" -//! }, -//! { -//! "name": "timestamp_micros", -//! "type": "long", -//! "logicalType": "timestamp-micros" -//! }, -//! { -//! "name": "local_timestamp_millis", -//! "type": "long", -//! "logicalType": "local-timestamp-millis" -//! }, -//! { -//! "name": "local_timestamp_micros", -//! "type": "long", -//! "logicalType": "local-timestamp-micros" -//! }, -//! { -//! "name": "duration", -//! "type": { -//! "type": "fixed", -//! "size": 12, -//! "name": "duration" -//! }, -//! "logicalType": "duration" -//! } -//! ] -//! } -//! "#; -//! -//! let schema = Schema::parse_str(raw_schema)?; -//! -//! println!("{:?}", schema); -//! -//! let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default())).unwrap() ; -//! -//! let mut record = Record::new(writer.schema()).unwrap(); -//! record.put("decimal_fixed", Decimal::from(9936.to_bigint().unwrap().to_signed_bytes_be())); -//! record.put("decimal_var", Decimal::from(((-32442).to_bigint().unwrap()).to_signed_bytes_be())); -//! record.put("uuid", uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap()); -//! record.put("date", Value::Date(1)); -//! record.put("time_millis", Value::TimeMillis(2)); -//! record.put("time_micros", Value::TimeMicros(3)); -//! record.put("timestamp_millis", Value::TimestampMillis(4)); -//! record.put("timestamp_micros", Value::TimestampMicros(5)); -//! record.put("timestamp_nanos", Value::TimestampNanos(6)); -//! record.put("local_timestamp_millis", Value::LocalTimestampMillis(4)); -//! record.put("local_timestamp_micros", Value::LocalTimestampMicros(5)); -//! record.put("local_timestamp_nanos", Value::LocalTimestampMicros(6)); -//! record.put("duration", Duration::new(Months::new(6), Days::new(7), Millis::new(8))); -//! -//! writer.append_value(record)?; -//! -//! let input = writer.into_inner()?; -//! let reader = Reader::with_schema(&schema, &input[..])?; -//! -//! for record in reader { -//! println!("{:?}", record?); -//! } -//! Ok(()) -//! } -//! ``` -//! -//! ## Calculate Avro schema fingerprint -//! -//! This library supports calculating the following fingerprints: -//! -//! - SHA-256 -//! - MD5 -//! - Rabin -//! -//! An example of fingerprinting for the supported fingerprints: -//! -//! ```rust -//! use apache_avro::rabin::Rabin; -//! use apache_avro::{Schema, Error}; -//! use md5::Md5; -//! use sha2::Sha256; -//! -//! fn main() -> Result<(), Error> { -//! let raw_schema = r#" -//! { -//! "type": "record", -//! "name": "test", -//! "fields": [ -//! {"name": "a", "type": "long", "default": 42}, -//! {"name": "b", "type": "string"} -//! ] -//! } -//! "#; -//! let schema = Schema::parse_str(raw_schema)?; -//! println!("{}", schema.fingerprint::<Sha256>()); -//! println!("{}", schema.fingerprint::<Md5>()); -//! println!("{}", schema.fingerprint::<Rabin>()); -//! Ok(()) -//! } -//! ``` -//! -//! ## Ill-formed data -//! -//! In order to ease decoding, the Binary Encoding specification of Avro data -//! requires some fields to have their length encoded alongside the data. -//! -//! If encoded data passed to a `Reader` has been ill-formed, it can happen that -//! the bytes meant to contain the length of data are bogus and could result -//! in extravagant memory allocation. -//! -//! To shield users from ill-formed data, `apache-avro` sets a limit (default: 512MB) -//! to any allocation it will perform when decoding data. -//! -//! 
If you expect some of your data fields to be larger than this limit, be sure -//! to make use of the `max_allocation_bytes` function before reading **any** data -//! (we leverage Rust's [`std::sync::Once`](https://doc.rust-lang.org/std/sync/struct.Once.html) -//! mechanism to initialize this value, if -//! any call to decode is made before a call to `max_allocation_bytes`, the limit -//! will be 512MB throughout the lifetime of the program). -//! -//! -//! ```rust -//! use apache_avro::max_allocation_bytes; -//! -//! max_allocation_bytes(2 * 1024 * 1024 * 1024); // 2GB -//! -//! // ... happily decode large data -//! -//! ``` -//! -//! ## Check schemas compatibility -//! -//! This library supports checking for schemas compatibility. -//! -//! Examples of checking for compatibility: -//! -//! 1. Compatible schemas -//! -//! Explanation: an int array schema can be read by a long array schema- an int -//! (32bit signed integer) fits into a long (64bit signed integer) -//! -//! ```rust -//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility}; -//! -//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap(); -//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap(); -//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_ok()); -//! ``` -//! -//! 2. Incompatible schemas (a long array schema cannot be read by an int array schema) -//! -//! Explanation: a long array schema cannot be read by an int array schema- a -//! long (64bit signed integer) does not fit into an int (32bit signed integer) -//! -//! ```rust -//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility}; -//! -//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap(); -//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap(); -//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_err()); -//! ``` -//! ## Custom names validators -//! -//! By default the library follows the rules by the -//! [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#names)! -//! -//! Some of the other Apache Avro language SDKs are not that strict and allow more -//! characters in names. For interoperability with those SDKs, the library provides -//! a way to customize the names validation. -//! -//! ```rust -//! use apache_avro::AvroResult; -//! use apache_avro::schema::Namespace; -//! use apache_avro::validator::{SchemaNameValidator, set_schema_name_validator}; -//! -//! struct MyCustomValidator; -//! -//! impl SchemaNameValidator for MyCustomValidator { -//! fn validate(&self, name: &str) -> AvroResult<(String, Namespace)> { -//! todo!() -//! } -//! } -//! -//! // don't parse any schema before registering the custom validator(s) ! -//! -//! set_schema_name_validator(Box::new(MyCustomValidator)); -//! -//! // ... use the library -//! ``` -//! -//! Similar logic could be applied to the schema namespace, enum symbols and field names validation. -//! -//! **Note**: the library allows to set a validator only once per the application lifetime! -//! If the application parses schemas before setting a validator, the default validator will be -//! registered and used! -//! -//! ## Custom schema equality comparators -//! -//! The library provides two implementations of schema equality comparators: -//! 1. `SpecificationEq` - a comparator that serializes the schemas to their -//! canonical forms (i.e. JSON) and compares them as strings. 
It is the only implementation -//! until apache_avro 0.16.0. -//! See the [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas) -//! for more information! -//! 2. `StructFieldEq` - a comparator that compares the schemas structurally. -//! It is faster than the `SpecificationEq` because it returns `false` as soon as a difference -//! is found and is recommended for use! -//! It is the default comparator since apache_avro 0.17.0. -//! -//! To use a custom comparator, you need to implement the `SchemataEq` trait and set it using the -//! `set_schemata_equality_comparator` function: -//! -//! ```rust -//! use apache_avro::{AvroResult, Schema}; -//! use apache_avro::schema::Namespace; -//! use apache_avro::schema_equality::{SchemataEq, set_schemata_equality_comparator}; -//! -//! #[derive(Debug)] -//! struct MyCustomSchemataEq; -//! -//! impl SchemataEq for MyCustomSchemataEq { -//! fn compare(&self, schema_one: &Schema, schema_two: &Schema) -> bool { -//! todo!() -//! } -//! } -//! -//! // don't parse any schema before registering the custom comparator ! -//! -//! set_schemata_equality_comparator(Box::new(MyCustomSchemataEq)); -//! -//! // ... use the library -//! ``` -//! **Note**: the library allows to set a comparator only once per the application lifetime! -//! If the application parses schemas before setting a comparator, the default comparator will be -//! registered and used! -//! -//! ## Deserializing Avro Byte Arrays -//! -//! If using the Serde way to deserialize avro files, there are sometimes special derive statements -//! that need to be applied in the case of byte arrays. -//! -//! ```rust -//! use serde::{Deserialize, Serialize}; -//! -//! #[derive(Debug, Deserialize, Serialize)] -//! struct SampleStruct { -//! #[serde(with = "apache_avro::serde::bytes")] -//! non_optional_bytes: Vec<u8>, -//! #[serde(with = "apache_avro::serde::bytes_opt")] -//! optional_bytes: Option<Vec<u8>>, -//! #[serde(with = "apache_avro::serde::fixed")] -//! non_optional_fixed: [u8; 6], -//! #[serde(with = "apache_avro::serde::fixed_opt")] -//! optional_fixed: Option<[u8; 6]>, -//! } -//! ``` -//! -//! Here is a complete example of a serde round trip of a struct with a nullable byte array: -//! -//! ```rust -//! use serde::{Deserialize, Serialize}; -//! -//! #[derive(Debug, Deserialize, PartialEq, Serialize)] -//! struct ExampleByteArray { -//! #[serde(with = "apache_avro::serde::bytes_opt")] -//! data_bytes: Option<Vec<u8>>, -//! description: Option<String>, -//! } -//! -//! fn serde_byte_array() { -//! let raw_schema = r#" -//! { -//! "type": "record", -//! "name": "SimpleRecord", -//! "fields": [ -//! {"name": "data_bytes", "type": ["null", "bytes"], "default": null}, -//! {"name": "description", "type": ["null", "string"], "default": null} -//! ] -//! }"#; -//! -//! let schema = apache_avro::Schema::parse_str(raw_schema).unwrap(); -//! -//! // Create vector of ExampleByteArray -//! let records = vec![ -//! ExampleByteArray { -//! data_bytes: Some(vec![1, 2, 3, 4, 5]), -//! description: Some("First record".to_string()), -//! }, -//! ExampleByteArray { -//! data_bytes: None, -//! description: Some("Second record".to_string()), -//! }, -//! ExampleByteArray { -//! data_bytes: Some(vec![10, 20, 30]), -//! description: None, -//! }, -//! ]; -//! -//! // Serialize records to Avro binary format with the schema -//! let mut writer = apache_avro::Writer::new(&schema, Vec::new()).unwrap(); -//! for record in &records { -//! 
writer.append_ser(record).unwrap(); -//! } -//! -//! let avro_data = writer.into_inner().unwrap(); -//! -//! -//! // Deserialize Avro binary data back into ExampleByteArray structs -//! let reader = apache_avro::Reader::new(&avro_data[..]).unwrap(); -//! let deserialized_records: Vec<ExampleByteArray> = reader -//! .map(|value| apache_avro::from_value::<ExampleByteArray>(&value.unwrap()).unwrap()) -//! .collect(); -//! -//! assert_eq!(records, deserialized_records); -//! } -//! ``` mod bigdecimal; mod bytes; @@ -949,6 +72,8 @@ mod encode; mod reader; mod writer; +#[cfg(doc)] +pub mod documentation; pub mod error; pub mod headers; pub mod rabin; diff --git a/avro/src/rabin.rs b/avro/src/rabin.rs index 2587a6d..497f476 100644 --- a/avro/src/rabin.rs +++ b/avro/src/rabin.rs @@ -43,12 +43,12 @@ fn fp_table() -> &'static [i64; 256] { }) } -/// Implementation of the Rabin fingerprint algorithm using the Digest trait as described in [schema_fingerprints](https://avro.apache.org/docs/current/specification/#schema-fingerprints). +/// Implementation of the Rabin fingerprint algorithm using the [`Digest`](digest::Digest) trait as described in [schema fingerprints]. /// /// The digest is returned as the 8-byte little-endian encoding of the Rabin hash. -/// This is what is used for avro [single object encoding](https://avro.apache.org/docs/current/specification/#single-object-encoding) +/// This is what is used for Avro [single object encoding]. /// -/// ```rust +/// ``` /// use apache_avro::rabin::Rabin; /// use digest::Digest; /// use hex_literal::hex; @@ -65,9 +65,9 @@ fn fp_table() -> &'static [i64; 256] { /// assert_eq!(result[..], hex!("60335ba6d0415528")); /// ``` /// -/// To convert the digest to the commonly used 64-bit integer value, you can use the i64::from_le_bytes() function +/// To convert the digest to the commonly used 64-bit integer value, you can use the [`i64::from_le_bytes()`] function /// -/// ```rust +/// ``` /// # use apache_avro::rabin::Rabin; /// # use digest::Digest; /// # use hex_literal::hex; @@ -84,6 +84,8 @@ fn fp_table() -> &'static [i64; 256] { /// /// assert_eq!(i, 2906301498937520992) /// ``` +/// [single object encoding]: https://avro.apache.org/docs/current/specification/#single-object-encoding +/// [schema fingerprints]: https://avro.apache.org/docs/current/specification/#schema-fingerprints #[derive(Clone)] pub struct Rabin { result: i64, diff --git a/avro/src/reader.rs b/avro/src/reader.rs index 3c3b740..8af522c 100644 --- a/avro/src/reader.rs +++ b/avro/src/reader.rs @@ -459,12 +459,12 @@ pub fn from_avro_datum<R: Read>( } } -/// Decode a `Value` encoded in Avro format given the provided `Schema` and anything implementing `io::Read` -/// to read from. +/// Decode a `Value` from raw Avro data. +/// /// If the writer schema is incomplete, i.e. contains `Schema::Ref`s then it will use the provided /// schemata to resolve any dependencies. /// -/// In case a reader `Schema` is provided, schema resolution will also be performed. +/// When a reader `Schema` is provided, schema resolution will also be performed. pub fn from_avro_datum_schemata<R: Read>( writer_schema: &Schema, writer_schemata: Vec<&Schema>, @@ -480,12 +480,12 @@ pub fn from_avro_datum_schemata<R: Read>( ) } -/// Decode a `Value` encoded in Avro format given the provided `Schema` and anything implementing `io::Read` -/// to read from. +/// Decode a `Value` from raw Avro data. +/// /// If the writer schema is incomplete, i.e.
contains `Schema::Ref`s then it will use the provided /// schemata to resolve any dependencies. /// -/// In case a reader `Schema` is provided, schema resolution will also be performed. +/// When a reader `Schema` is provided, schema resolution will also be performed. pub fn from_avro_datum_reader_schemata<R: Read>( writer_schema: &Schema, writer_schemata: Vec<&Schema>, diff --git a/avro/src/schema/mod.rs b/avro/src/schema/mod.rs index 0ac4803..e1c6c7d 100644 --- a/avro/src/schema/mod.rs +++ b/avro/src/schema/mod.rs @@ -59,7 +59,8 @@ pub use crate::schema::{ /// Represents documentation for complex Avro schemas. pub type Documentation = Option<String>; -/// Represents an Avro schema fingerprint +/// Represents an Avro schema fingerprint. +/// /// More information about Avro schema fingerprints can be found in the /// [Avro Schema Fingerprint documentation](https://avro.apache.org/docs/current/specification/#schema-fingerprints) pub struct SchemaFingerprint { @@ -82,7 +83,7 @@ impl fmt::Display for SchemaFingerprint { /// Represents any valid Avro schema /// More information about Avro schemas can be found in the -/// [Avro Specification](https://avro.apache.org/docs/current/specification/#schema-declaration) +/// [Avro Specification](https://avro.apache.org/docs/++version++/specification/#schema-declaration) #[derive(Clone, Debug, EnumDiscriminants, Display)] #[strum_discriminants(name(SchemaKind), derive(Hash, Ord, PartialOrd))] pub enum Schema { @@ -99,17 +100,20 @@ pub enum Schema { /// A `double` Avro schema. Double, /// A `bytes` Avro schema. + /// /// `Bytes` represents a sequence of 8-bit unsigned bytes. Bytes, /// A `string` Avro schema. + /// /// `String` represents a unicode character sequence. String, - /// A `array` Avro schema. Avro arrays are required to have the same type for each element. - /// This variant holds the `Schema` for the array element type. + /// An `array` Avro schema. + /// + /// All items will have the same schema. Array(ArraySchema), /// A `map` Avro schema. - /// `Map` holds a pointer to the `Schema` of its values, which must all be the same schema. - /// `Map` keys are assumed to be `string`. + /// + /// Keys are always a `Schema::String` and all values will have the same schema. Map(MapSchema), /// A `union` Avro schema. Union(UnionSchema), @@ -119,22 +123,27 @@ pub enum Schema { Enum(EnumSchema), /// A `fixed` Avro schema. Fixed(FixedSchema), - /// Logical type which represents `Decimal` values. The underlying type is serialized and - /// deserialized as `Schema::Bytes` or `Schema::Fixed`. + /// Logical type which represents `Decimal` values. + /// + /// The underlying type is serialized and deserialized as `Schema::Bytes` or `Schema::Fixed`. Decimal(DecimalSchema), /// Logical type which represents `Decimal` values without predefined scale. + /// /// The underlying type is serialized and deserialized as `Schema::Bytes` BigDecimal, /// A universally unique identifier, annotating a string, bytes or fixed. Uuid(UuidSchema), /// Logical type which represents the number of days since the unix epoch. + /// /// Serialization format is `Schema::Int`. Date, - /// The time of day in number of milliseconds after midnight with no reference any calendar, - /// time zone or date in particular. + /// The time of day in number of milliseconds after midnight. + /// + /// This type has no reference to any calendar, time zone or date in particular.
TimeMillis, - /// The time of day in number of microseconds after midnight with no reference any calendar, - /// time zone or date in particular. + /// The time of day in number of microseconds after midnight. + /// + /// This type has no reference to any calendar, time zone or date in particular. TimeMicros, /// An instant in time represented as the number of milliseconds after the UNIX epoch. TimestampMillis, @@ -397,7 +406,33 @@ impl Schema { Ok(this.canonical_form()) } - /// Generate [fingerprint] of Schema's [Parsing Canonical Form]. + /// Generate the [fingerprint] of the schema's [Parsing Canonical Form]. + /// + /// # Example + /// ``` + /// use apache_avro::rabin::Rabin; + /// use apache_avro::{Schema, Error}; + /// use md5::Md5; + /// use sha2::Sha256; + /// + /// fn main() -> Result<(), Error> { + /// let raw_schema = r#" + /// { + /// "type": "record", + /// "name": "test", + /// "fields": [ + /// {"name": "a", "type": "long", "default": 42}, + /// {"name": "b", "type": "string"} + /// ] + /// } + /// "#; + /// let schema = Schema::parse_str(raw_schema)?; + /// println!("{}", schema.fingerprint::<Sha256>()); + /// println!("{}", schema.fingerprint::<Md5>()); + /// println!("{}", schema.fingerprint::<Rabin>()); + /// Ok(()) + /// } + /// ``` /// /// [Parsing Canonical Form]: /// https://avro.apache.org/docs/current/specification/#parsing-canonical-form-for-schemas @@ -608,7 +643,7 @@ impl Schema { } } - /// Returns a Schema::Map with the given types. + /// Returns a `Schema::Map` with the given types. pub fn map(types: Schema) -> Self { Schema::Map(MapSchema { types: Box::new(types), @@ -616,7 +651,7 @@ impl Schema { }) } - /// Returns a Schema::Map with the given types and custom attributes. + /// Returns a `Schema::Map` with the given types and custom attributes. pub fn map_with_attributes(types: Schema, attributes: BTreeMap<String, Value>) -> Self { Schema::Map(MapSchema { types: Box::new(types), @@ -624,7 +659,7 @@ impl Schema { }) } - /// Returns a Schema::Array with the given items. + /// Returns a `Schema::Array` with the given items. pub fn array(items: Schema) -> Self { Schema::Array(ArraySchema { items: Box::new(items), @@ -632,7 +667,7 @@ impl Schema { }) } - /// Returns a Schema::Array with the given items and custom attributes. + /// Returns a `Schema::Array` with the given items and custom attributes. pub fn array_with_attributes(items: Schema, attributes: BTreeMap<String, Value>) -> Self { Schema::Array(ArraySchema { items: Box::new(items), @@ -903,8 +938,9 @@ impl Serialize for Schema { } } -/// Parses a **valid** avro schema into the Parsing Canonical Form. -/// https://avro.apache.org/docs/current/specification/#parsing-canonical-form-for-schemas +/// Parses a valid Avro schema into [the Parsing Canonical Form]. +/// +/// [the Parsing Canonical Form]: https://avro.apache.org/docs/current/specification/#parsing-canonical-form-for-schemas fn parsing_canonical_form(schema: &Value, defined_names: &mut HashSet<String>) -> String { match schema { Value::Object(map) => pcf_map(map, defined_names), diff --git a/avro/src/schema/name.rs b/avro/src/schema/name.rs index 624443f..57c1cae 100644 --- a/avro/src/schema/name.rs +++ b/avro/src/schema/name.rs @@ -121,18 +121,19 @@ impl Name { } } - /// Return the fully qualified name needed for indexing or searching for the schema within a schema/schema env context.
Puts the enclosing namespace into the name's namespace for clarity in schema/schema env parsing - ```ignore - /// use apache_avro::schema::Name; + /// Construct the fully qualified name. /// + /// ``` + /// # use apache_avro::{Error, schema::Name}; /// assert_eq!( - /// Name::new("some_name")?.fully_qualified_name(&Some("some_namespace".into())), - /// Name::new("some_namespace.some_name")? + /// Name::new("some_name")?.fully_qualified_name(&Some("some_namespace".into())), + /// Name::new("some_namespace.some_name")? /// ); /// assert_eq!( - /// Name::new("some_namespace.some_name")?.fully_qualified_name(&Some("other_namespace".into())), - /// Name::new("some_namespace.some_name")? + /// Name::new("some_namespace.some_name")?.fully_qualified_name(&Some("other_namespace".into())), + /// Name::new("some_namespace.some_name")? /// ); + /// # Ok::<(), Error>(()) /// ``` pub fn fully_qualified_name(&self, enclosing_namespace: &Namespace) -> Name { Name { diff --git a/avro/src/schema/parser.rs b/avro/src/schema/parser.rs index ced79bc..2f3f6a1 100644 --- a/avro/src/schema/parser.rs +++ b/avro/src/schema/parser.rs @@ -33,12 +33,10 @@ use std::collections::{BTreeMap, HashMap, HashSet}; #[derive(Default)] pub(crate) struct Parser { input_schemas: HashMap<Name, Value>, - /// A map of name -> Schema::Ref /// Used to resolve cyclic references, i.e. when a /// field's type is a reference to its record's type resolving_schemas: Names, input_order: Vec<Name>, - /// A map of name -> fully parsed Schema /// Used to avoid parsing the same schema twice parsed_schemas: Names, } @@ -67,8 +65,9 @@ impl Parser { self.parse(&value, &None) } - /// Create an array of `Schema`'s from an iterator of JSON Avro schemas. It is allowed that - /// the schemas have cross-dependencies; these will be resolved during parsing. + /// Create an array of `Schema`s from an iterator of JSON Avro schemas. + /// + /// It is allowed that the schemas have cross-dependencies; these will be resolved during parsing. pub(super) fn parse_list(&mut self) -> AvroResult<Vec<Schema>> { self.parse_input_schemas()?; @@ -83,7 +82,7 @@ impl Parser { Ok(parsed_schemas) } - /// Convert the input schemas to parsed_schemas + /// Convert the input schemas to `parsed_schemas`. pub(super) fn parse_input_schemas(&mut self) -> Result<(), Error> { while !self.input_schemas.is_empty() { let next_name = self @@ -103,8 +102,7 @@ impl Parser { Ok(()) } - /// Create a `Schema` from a `serde_json::Value` representing a JSON Avro - /// schema. + /// Create a `Schema` from a `serde_json::Value` representing a JSON Avro schema. pub(super) fn parse( &mut self, value: &Value, @@ -120,9 +118,7 @@ impl Parser { } } - /// Parse a `serde_json::Value` representing an Avro type whose Schema is known into a - /// `Schema`. A Schema for a `serde_json::Value` is known if it is primitive or has - /// been parsed previously by the parsed and stored in its map of parsed_schemas. + /// Parse a string as a primitive type or a reference to `parsed_schemas`. fn parse_known_schema( &mut self, name: &str, @@ -142,9 +138,10 @@ impl Parser { } /// Given a name, tries to retrieve the parsed schema from `parsed_schemas`. + /// /// If a parsed schema is not found, it checks if a currently resolving /// schema with that name exists.
- /// If a resolving schema is not found, it checks if a JSON with that name exists /// in `input_schemas` and then parses it (removing it from `input_schemas`) /// and adds the parsed schema to `parsed_schemas`. /// @@ -240,11 +237,10 @@ impl Parser { } } - /// Parse a `serde_json::Value` representing a complex Avro type into a - /// `Schema`. + /// Parse a `serde_json::Value` representing a complex Avro type into a `Schema`. /// /// Avro supports "recursive" definition of types. - /// e.g: {"type": {"type": "string"}} + /// e.g. `{"type": {"type": "string"}}` pub(super) fn parse_complex( &mut self, complex: &Map<String, Value>, @@ -539,8 +535,7 @@ impl Parser { } } - /// Parse a `serde_json::Value` representing a Avro record type into a - /// `Schema`. + /// Parse a `serde_json::Value` representing an Avro record type into a `Schema`. fn parse_record( &mut self, complex: &Map<String, Value>, @@ -619,8 +614,7 @@ impl Parser { custom_attributes } - /// Parse a `serde_json::Value` representing a Avro enum type into a - /// `Schema`. + /// Parse a `serde_json::Value` representing an Avro enum type into a `Schema`. fn parse_enum( &mut self, complex: &Map<String, Value>, @@ -697,8 +691,7 @@ impl Parser { Ok(schema) } - /// Parse a `serde_json::Value` representing a Avro array type into a - /// `Schema`. + /// Parse a `serde_json::Value` representing an Avro array type into a `Schema`. fn parse_array( &mut self, complex: &Map<String, Value>, @@ -716,8 +709,7 @@ impl Parser { }) } - /// Parse a `serde_json::Value` representing a Avro map type into a - /// `Schema`. + /// Parse a `serde_json::Value` representing an Avro map type into a `Schema`. fn parse_map( &mut self, complex: &Map<String, Value>, @@ -735,8 +727,7 @@ impl Parser { }) } - /// Parse a `serde_json::Value` representing a Avro union type into a - /// `Schema`. + /// Parse a `serde_json::Value` representing an Avro union type into a `Schema`. fn parse_union( &mut self, items: &[Value], @@ -764,8 +755,7 @@ impl Parser { }) } - /// Parse a `serde_json::Value` representing a Avro fixed type into a - /// `Schema`. + /// Parse a `serde_json::Value` representing an Avro fixed type into a `Schema`. fn parse_fixed( &mut self, complex: &Map<String, Value>, diff --git a/avro/src/schema/union.rs b/avro/src/schema/union.rs index 5bf631a..7510a13 100644 --- a/avro/src/schema/union.rs +++ b/avro/src/schema/union.rs @@ -36,7 +36,7 @@ pub struct UnionSchema { } impl UnionSchema { - /// Creates a new UnionSchema from a vector of schemas. + /// Creates a new `UnionSchema` from a vector of schemas. /// /// # Errors /// Will return an error if `schemas` has duplicate unnamed schemas or if `schemas` diff --git a/avro/src/schema_compatibility.rs b/avro/src/schema_compatibility.rs index 9e62978..44a0511 100644 --- a/avro/src/schema_compatibility.rs +++ b/avro/src/schema_compatibility.rs @@ -15,7 +15,48 @@ // specific language governing permissions and limitations // under the License. -//! Logic for checking schema compatibility +//! Check if the reader's schema is compatible with the writer's schema. +//! +//! To allow for schema evolution, Avro supports resolving the writer's schema to the reader's schema. +//! To check if this is possible, [`SchemaCompatibility`] can be used. For the complete rules see +//! [the specification](https://avro.apache.org/docs/++version++/specification/#schema-resolution). +//! +//! There are three levels of compatibility. +//! +//! 1.
Fully compatible schemas (`Ok(Compatibility::Full)`) +//! +//! For example, an integer can always be resolved to a long: +//! +//! ``` +//! # use apache_avro::{Schema, schema_compatibility::{Compatibility, SchemaCompatibility}}; +//! let writers_schema = Schema::array(Schema::Int); +//! let readers_schema = Schema::array(Schema::Long); +//! assert_eq!(SchemaCompatibility::can_read(&writers_schema, &readers_schema), Ok(Compatibility::Full)); +//! ``` +//! +//! 2. Incompatible schemas (`Err`) +//! +//! For example, a long can never be resolved to an int: +//! +//! ``` +//! # use apache_avro::{Schema, schema_compatibility::SchemaCompatibility}; +//! let writers_schema = Schema::array(Schema::Long); +//! let readers_schema = Schema::array(Schema::Int); +//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_err()); +//! ``` +//! +//! 3. Partially compatible schemas (`Ok(Compatibility::Partial)`) +//! +//! For example, a union of a string and integer is only compatible with an integer if an integer was written: +//! +//! ``` +//! # use apache_avro::{Error, Schema, schema_compatibility::{Compatibility, SchemaCompatibility}}; +//! let writers_schema = Schema::union(vec![Schema::Int, Schema::String])?; +//! let readers_schema = Schema::Int; +//! assert_eq!(SchemaCompatibility::can_read(&writers_schema, &readers_schema), Ok(Compatibility::Partial)); +//! # Ok::<(), Error>(()) +//! ``` +//! use crate::{ error::CompatibilityError, schema::{ @@ -31,9 +72,35 @@ use std::{ ptr, }; +/// Check if two schemas can be resolved. +/// +/// See [the module documentation] for more details. +/// +/// [the module documentation]: crate::schema_compatibility pub struct SchemaCompatibility; -/// How compatible are two schemas. +impl SchemaCompatibility { + /// Recursively check if the writer's schema can be resolved to the reader's schema + pub fn can_read( + writers_schema: &Schema, + readers_schema: &Schema, + ) -> Result<Compatibility, CompatibilityError> { + let mut c = Checker::new(); + c.can_read(writers_schema, readers_schema) + } + + /// Recursively check if both schemas can be resolved to each other + pub fn mutual_read( + schema_a: &Schema, + schema_b: &Schema, + ) -> Result<Compatibility, CompatibilityError> { + let mut c = SchemaCompatibility::can_read(schema_a, schema_b)?; + c &= SchemaCompatibility::can_read(schema_b, schema_a)?; + Ok(c) + } +} + +/// How compatible two schemas are. #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum Compatibility { /// Full compatibility, resolving will always work. @@ -375,29 +442,6 @@ impl Checker { } } -impl SchemaCompatibility { - /// `can_read` performs a full, recursive check that a datum written using the - /// writers_schema can be read using the readers_schema. - pub fn can_read( - writers_schema: &Schema, - readers_schema: &Schema, - ) -> Result<Compatibility, CompatibilityError> { - let mut c = Checker::new(); - c.can_read(writers_schema, readers_schema) - } - - /// `mutual_read` performs a full, recursive check that a datum written using either - /// the writers_schema or the readers_schema can be read using the other schema.
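A minimal sketch of `mutual_read` in the same style as the examples above (using only the `Schema::array` helper and `SchemaCompatibility` API shown in this hunk); resolution has to succeed in both directions for it to return `Ok`:

```
use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};

fn main() {
    let ints = Schema::array(Schema::Int);
    let longs = Schema::array(Schema::Long);
    // int -> long resolves, but long -> int does not, so mutual reading fails:
    assert!(SchemaCompatibility::mutual_read(&ints, &longs).is_err());
}
```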
- pub fn mutual_read( - writers_schema: &Schema, - readers_schema: &Schema, - ) -> Result<Compatibility, CompatibilityError> { - let mut c = SchemaCompatibility::can_read(writers_schema, readers_schema)?; - c &= SchemaCompatibility::can_read(readers_schema, writers_schema)?; - Ok(c) - } -} - #[cfg(test)] mod tests { use std::collections::BTreeMap; diff --git a/avro/src/schema_equality.rs b/avro/src/schema_equality.rs index cd1c2cd..d8f1558 100644 --- a/avro/src/schema_equality.rs +++ b/avro/src/schema_equality.rs @@ -15,6 +15,40 @@ // specific language governing permissions and limitations // under the License. +//! # Custom schema equality comparators +//! +//! The library provides two implementations of schema equality comparators: +//! 1. `StructFieldEq` (default) - compares the schemas structurally; this may slightly deviate from the specification. +//! 2. `SpecificationEq` - compares the schemas by serializing them to their canonical form and comparing +//! the resulting JSON. +//! +//! To use a custom comparator, you need to implement the `SchemataEq` trait and set it using the +//! `set_schemata_equality_comparator` function: +//! +//! ``` +//! use apache_avro::{AvroResult, Schema}; +//! use apache_avro::schema::Namespace; +//! use apache_avro::schema_equality::{SchemataEq, set_schemata_equality_comparator}; +//! +//! #[derive(Debug)] +//! struct MyCustomSchemataEq; +//! +//! impl SchemataEq for MyCustomSchemataEq { +//! fn compare(&self, schema_one: &Schema, schema_two: &Schema) -> bool { +//! todo!() +//! } +//! } +//! +//! // don't parse any schema before registering the custom comparator! +//! +//! set_schemata_equality_comparator(Box::new(MyCustomSchemataEq)); +//! +//! // ... use the library +//! ``` +//! **Note**: the library allows setting a comparator only once per application lifetime! +//! If the application parses schemas before setting a comparator, the default comparator will be +//! registered and used! + use crate::schema::{InnerDecimalSchema, UuidSchema}; use crate::{ Schema, @@ -27,15 +61,16 @@ use log::debug; use std::{fmt::Debug, sync::OnceLock}; /// A trait that compares two schemata for equality. -/// To register a custom one use [set_schemata_equality_comparator]. +/// +/// To register a custom one use [`set_schemata_equality_comparator`]. pub trait SchemataEq: Debug + Send + Sync { /// Compares two schemata for equality. fn compare(&self, schema_one: &Schema, schema_two: &Schema) -> bool; } -/// Compares two schemas according to the Avro specification by using -/// their canonical forms. -/// See <https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas> +/// Compares two schemas according to the Avro specification by using [their canonical forms]. +/// +/// [their canonical forms]: https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas #[derive(Debug)] pub struct SpecificationEq; impl SchemataEq for SpecificationEq { @@ -44,12 +79,15 @@ impl SchemataEq for SpecificationEq { } } -/// Compares two schemas for equality field by field, using only the fields that -/// are used to construct their canonical forms. -/// See <https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas> +/// Compares [the canonical forms] of two schemas for equality field by field. +/// +/// This means that attributes like `aliases`, `doc`, `default` and `logicalType` are ignored.
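A minimal sketch of using `StructFieldEq` directly (the two array schemas are placeholders; the comparator and its `include_attributes` field are shown in this hunk):

```
use apache_avro::Schema;
use apache_avro::schema_equality::{SchemataEq, StructFieldEq};

fn main() {
    let one = Schema::array(Schema::Int);
    let two = Schema::array(Schema::Int);
    // `include_attributes` controls whether custom attributes are compared too
    let comparator = StructFieldEq { include_attributes: false };
    assert!(comparator.compare(&one, &two));
}
```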
+/// +/// [the canonical forms]: https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas #[derive(Debug)] pub struct StructFieldEq { /// Whether to include custom attributes in the comparison. + /// /// The custom attributes are not used to construct the canonical form of the schema! pub include_attributes: bool, } diff --git a/avro/src/serde/derive.rs b/avro/src/serde/derive.rs index 5164458..0ef46f0 100644 --- a/avro/src/serde/derive.rs +++ b/avro/src/serde/derive.rs @@ -22,64 +22,288 @@ use crate::schema::{ use std::borrow::Cow; use std::collections::HashMap; -/// Trait for types that serve as an Avro data model. Derive implementation available -/// through `derive` feature. Do not implement directly! -/// Implement [`AvroSchemaComponent`] to get this trait +/// Trait for types that serve as an Avro data model. +/// +/// Do not implement directly! Either derive it or implement [`AvroSchemaComponent`] to get this trait /// through a blanket implementation. +/// +/// ## Deriving `AvroSchema` +/// +/// Using the custom derive requires that you enable the `"derive"` cargo +/// feature in your `Cargo.toml`: +/// +/// ```toml +/// [dependencies] +/// apache-avro = { version = "..", features = ["derive"] } +/// ``` +/// +/// Then, you add the `#[derive(AvroSchema)]` annotation to your `struct` and +/// `enum` type definition: +/// +/// ``` +/// # use serde::{Serialize, Deserialize}; +/// # use apache_avro::AvroSchema; +/// #[derive(AvroSchema, Serialize, Deserialize)] +/// pub struct Foo { +/// bar: Vec<Bar>, +/// } +/// +/// #[derive(AvroSchema, Serialize, Deserialize)] +/// pub enum Bar { +/// Spam, +/// Maps +/// } +/// ``` +/// +/// This will implement [`AvroSchemaComponent`] for the type, and `AvroSchema` +/// through the blanket implementation for `T: AvroSchemaComponent`. +/// +/// When deriving `struct`s, every member must also implement `AvroSchemaComponent`. +/// +/// ## Changing the generated schema +/// +/// The derive macro will read both the `avro` and `serde` attributes to modify the generated schema. +/// It will also check for compatibility between the various attributes. +/// +/// #### Container attributes +/// +/// - `#[serde(rename = "name")]` +/// +// TODO: Should we check if `name` contains any dots? As that would imply a namespace +/// Set the `name` of the schema to the given string. Defaults to the name of the type. +/// +/// - `#[avro(namespace = "some.name.space")]` +/// +/// Set the `namespace` of the schema. This will be the relative namespace if the schema is included +/// in another schema. +/// +/// - `#[avro(doc = "Some documentation")]` +/// +/// Set the `doc` attribute of the schema. Defaults to the documentation of the type. +/// +/// - `#[avro(alias = "name")]` +/// +/// Set the `alias` attribute of the schema. Can be specified multiple times. +/// +/// - `#[serde(rename_all = "camelCase")]` +/// +/// Rename all the fields or variants in the schema to follow the given case convention. The possible values +/// are `"lowercase"`, `"UPPERCASE"`, `"PascalCase"`, `"camelCase"`, `"snake_case"`, `"kebab-case"`, +/// `"SCREAMING_SNAKE_CASE"`, `"SCREAMING-KEBAB-CASE"`. +/// +/// - `#[serde(transparent)]` +/// +/// Use the schema of the inner field directly. Only allowed on structs with a single unskipped field. +/// +/// +/// #### Variant attributes +/// +/// - `#[serde(rename = "name")]` +/// +/// Rename the variant to the given name.
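A minimal sketch of a few of the container attributes above (the type name, namespace and field are placeholders; `get_schema` comes from the derived `AvroSchema` impl):

```
use apache_avro::AvroSchema;
use serde::{Deserialize, Serialize};

#[derive(AvroSchema, Serialize, Deserialize)]
#[avro(namespace = "com.example")]
#[serde(rename_all = "camelCase")]
struct UserEvent {
    // Appears as "userId" in the generated schema
    user_id: i64,
}

fn main() {
    let schema = UserEvent::get_schema();
    println!("{schema:?}");
}
```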
+/// +/// +/// #### Field attributes +/// +/// - `#[serde(rename = "name")]` +/// +/// Rename the field to the given name. +/// +/// - `#[avro(doc = "Some documentation")]` +/// +/// Set the `doc` attribute of the field. Defaults to the documentation of the field. +/// +/// - `#[avro(default = "null")]` +/// +/// Set the `default` attribute of the field. +/// +/// _Note:_ This is a JSON value, not a Rust value, as this is put in the schema itself. +/// +/// - `#[serde(alias = "name")]` +/// +/// Set the `alias` attribute of the field. Can be specified multiple times. +/// +/// - `#[serde(flatten)]` +/// +/// Flatten the content of this field into the container it is defined in. +/// +/// - `#[serde(skip)]` +/// +/// Do not include this field in the schema. +/// +/// - `#[serde(skip_serializing)]` +/// +/// When combined with `#[serde(skip_deserializing)]`, don't include this field in the schema. +/// Otherwise, it will be included in the schema and the `#[avro(default)]` attribute **must** be +/// set. That value will be used for serializing. +/// +/// - `#[serde(skip_serializing_if)]` +/// +/// Conditionally use the value of the field or the value provided by `#[avro(default)]`. The +/// `#[avro(default)]` attribute **must** be set. +/// +/// - `#[avro(with)]` and `#[serde(with = "module")]` +/// +/// Override the schema used for this field. See [Working with foreign types](#working-with-foreign-types). +/// +/// #### Incompatible Serde attributes +/// +/// The derive macro is compatible with most Serde attributes, but it is incompatible with +/// the following attributes: +/// +/// - Container attributes +/// - `tag` +/// - `content` +/// - `untagged` +/// - `variant_identifier` +/// - `field_identifier` +/// - `remote` +/// - `rename_all(serialize = "..", deserialize = "..")` where `serialize` != `deserialize` +/// - Variant attributes +/// - `other` +/// - `untagged` +/// - Field attributes +/// - `getter` +/// +/// ## Working with foreign types +/// +/// Most foreign types won't have an [`AvroSchema`] implementation. This crate implements it only +/// for built-in types and [`uuid::Uuid`]. +/// +/// To still be able to derive schemas for fields of foreign types, the `#[avro(with)]` +/// attribute can be used to get the schema for those fields. It can be used in two ways: +/// +/// 1. In combination with `#[serde(with = "path::to::module")]` +/// +/// To get the schema, it will call the functions `fn get_schema_in_ctxt(&mut Names, &Namespace) -> Schema` +/// and `fn get_record_fields_in_ctxt(usize, &mut Names, &Namespace) -> Option<Vec<RecordField>>` in the module provided +/// to the Serde attribute. See [`AvroSchemaComponent`] for details on how to implement those +/// functions. +/// +/// 2. By providing a function directly, `#[avro(with = some_fn)]`. +/// +/// To get the schema, it will call the function provided. It must have the signature +/// `fn(&mut Names, &Namespace) -> Schema`. When this is used for a `transparent` struct, the +/// default implementation of [`AvroSchemaComponent::get_record_fields_in_ctxt`] will be used. +/// This is only recommended for primitive types, as the default implementation cannot be efficiently +/// implemented for complex types. +/// pub trait AvroSchema { + /// Construct the full schema that represents this type. + /// + /// The returned schema is fully independent and contains `Schema::Ref`s only to named types defined + /// earlier in the schema. fn get_schema() -> Schema; } -/// Trait for types that serve as fully defined components inside an Avro data model.
Derive -/// implementation available through `derive` feature. This is what is implemented by -/// the `derive(AvroSchema)` macro. +/// Trait for types that serve as fully defined components inside an Avro data model. +/// +/// This trait can be derived with [`#[derive(AvroSchema)]`](AvroSchema) when the `derive` feature is enabled. /// /// # Implementation guide /// -/// ### Simple implementation -/// To construct a non named simple schema, it is possible to ignore the input argument making the -/// general form implementation look like -/// ```ignore -/// impl AvroSchemaComponent for AType { +/// ### Implementation for returning primitive types +/// When the schema you want to return is a primitive type (a type without a name), the function +/// arguments can be ignored. +/// +/// For example, suppose you have a custom integer type: +/// ``` +/// # use apache_avro::{Schema, serde::{AvroSchemaComponent}, schema::{Names, Namespace, RecordField}}; +/// // Make sure to implement `Serialize` and `Deserialize` to use the right serialization methods +/// pub struct U24([u8; 3]); +/// impl AvroSchemaComponent for U24 { /// fn get_schema_in_ctxt(_: &mut Names, _: &Namespace) -> Schema { -/// Schema::? -/// } +/// Schema::Int +/// } +/// +/// fn get_record_fields_in_ctxt(_: usize, _: &mut Names, _: &Namespace) -> Option<Vec<RecordField>> { +/// None // A Schema::Int is not a Schema::Record so there are no fields to return +/// } ///} /// ``` /// /// ### Passthrough implementation /// -/// To construct a schema for a Type that acts as in "inner" type, such as for smart pointers, simply -/// pass through the arguments to the inner type -/// ```ignore -/// impl AvroSchemaComponent for PassthroughType { +/// To construct a schema for a type that is "transparent", such as for smart pointers, simply +/// pass through the arguments to the inner type: +/// ``` +/// # use apache_avro::{Schema, serde::{AvroSchemaComponent}, schema::{Names, Namespace, RecordField}}; +/// # use serde::{Serialize, Deserialize}; +/// #[derive(Serialize, Deserialize)] +/// #[serde(transparent)] // This attribute is important for all passthrough implementations! +/// pub struct Transparent<T>(T); +/// impl<T: AvroSchemaComponent> AvroSchemaComponent for Transparent<T> { /// fn get_schema_in_ctxt(named_schemas: &mut Names, enclosing_namespace: &Namespace) -> Schema { -/// InnerType::get_schema_in_ctxt(named_schemas, enclosing_namespace) -/// } +/// T::get_schema_in_ctxt(named_schemas, enclosing_namespace) +/// } +/// +/// fn get_record_fields_in_ctxt(first_field_position: usize, named_schemas: &mut Names, enclosing_namespace: &Namespace) -> Option<Vec<RecordField>> { +/// T::get_record_fields_in_ctxt(first_field_position, named_schemas, enclosing_namespace) +/// } ///} /// ``` /// -/// ### Complex implementation +/// ### Implementation for complex types +/// When the schema you want to return is a complex type (a type with a name), special care has to +/// be taken to avoid duplicate type definitions and to use the correct namespace. /// -/// To implement this for Named schema there is a general form needed to avoid creating invalid -/// schemas or infinite loops.
-/// ```ignore -/// impl AvroSchemaComponent for ComplexType { +/// Things to keep in mind: +/// - If the fully qualified name already exists, return a [`Schema::Ref`] +/// - Use the `AvroSchemaComponent` implementations to get the schemas for the subtypes +/// - The ordering of fields in the schema **must** match the ordering in Serde +/// - Implement `get_record_fields_in_ctxt`, as the default implementation has to fall back on +/// backtracking and a lot of cloning. +/// - Even if your schema is not a record, still implement the function and just return `None` +/// +/// ``` +/// # use apache_avro::{Schema, serde::{AvroSchemaComponent}, schema::{Name, Names, Namespace, RecordField, RecordSchema}}; +/// # use serde::{Serialize, Deserialize}; +/// # use std::time::Duration; +/// pub struct Foo { +/// one: String, +/// two: i32, +/// three: Option<Duration> +/// } +/// +/// impl AvroSchemaComponent for Foo { /// fn get_schema_in_ctxt(named_schemas: &mut Names, enclosing_namespace: &Namespace) -> Schema { /// // Create the fully qualified name for your type given the enclosing namespace -/// let name = apache_avro::schema::Name::new("MyName") -/// .expect("Unable to parse schema name") -/// .fully_qualified_name(enclosing_namespace); -/// let enclosing_namespace = &name.namespace; -/// // Check, if your name is already defined, and if so, return a ref to that name +/// let name = Name::new("Foo").unwrap().fully_qualified_name(enclosing_namespace); /// if named_schemas.contains_key(&name) { -/// apache_avro::schema::Schema::Ref{name: name.clone()} +/// Schema::Ref { name } /// } else { -/// named_schemas.insert(name.clone(), apache_avro::schema::Schema::Ref{name: name.clone()}); -/// // YOUR SCHEMA DEFINITION HERE with the name equivalent to "MyName". -/// // For non-simple sub types delegate to their implementation of AvroSchemaComponent +/// let enclosing_namespace = &name.namespace; +/// // This is needed because otherwise recursive types will recurse forever and cause a stack overflow +/// named_schemas.insert(name.clone(), Schema::Ref { name: name.clone() }); +/// let schema = Schema::Record(RecordSchema::builder() +/// .name(name.clone()) +/// .fields(Self::get_record_fields_in_ctxt(0, named_schemas, enclosing_namespace).expect("Impossible!")) +/// .build() +/// ); +/// named_schemas.insert(name, schema.clone()); +/// schema /// } -/// } +/// } +/// +/// fn get_record_fields_in_ctxt(first_field_position: usize, named_schemas: &mut Names, enclosing_namespace: &Namespace) -> Option<Vec<RecordField>> { +/// Some(vec![ +/// RecordField::builder() +/// .name("one") +/// .schema(String::get_schema_in_ctxt(named_schemas, enclosing_namespace)) +/// .position(first_field_position) +/// .build(), +/// RecordField::builder() +/// .name("two") +/// .schema(i32::get_schema_in_ctxt(named_schemas, enclosing_namespace)) +/// .position(first_field_position+1) +/// .build(), +/// RecordField::builder() +/// .name("three") +/// .schema(<Option<Duration>>::get_schema_in_ctxt(named_schemas, enclosing_namespace)) +/// .position(first_field_position+2) +/// .build(), +/// ]) +/// } ///} /// ``` pub trait AvroSchemaComponent { diff --git a/avro/src/serde/mod.rs b/avro/src/serde/mod.rs index 2a62b33..6065240 100644 --- a/avro/src/serde/mod.rs +++ b/avro/src/serde/mod.rs @@ -15,6 +15,98 @@ // specific language governing permissions and limitations // under the License. +//! # Using Avro in Rust, the Serde way. +//! +//!
Avro is a schema-based format, which means it requires a few extra steps to use compared to +//! a data format like JSON. +//! +//! ## Schemas +//! It's strongly recommended to derive the schemas for your types using the [`AvroSchema`] derive macro. +//! The macro uses the Serde attributes to generate a matching schema and checks that no attributes are +//! used that are incompatible with the Serde implementation in this crate. See [the trait documentation] for +//! details on how to change the generated schema. +//! +//! Alternatively, you can write your own schema. If you go down this path, it is recommended you start with +//! the schema derived by [`AvroSchema`] and then modify it to fit your needs. +//! +//! #### Performance pitfall +//! One performance pitfall with Serde is (de)serializing bytes. The implementation of [`Serialize`][`serde::Serialize`] +//! and [`Deserialize`][`serde::Deserialize`] for types such as `Vec<u8>`, `&[u8]` and `Cow<[u8]>` will +//! all use the array of integers representation. This can normally be fixed using the [`serde_bytes`] +//! crate; however, this crate also needs some extra information. Therefore, you need to use the +//! [`bytes`], [`bytes_opt`], [`fixed`], [`fixed_opt`], [`mod@slice`], and [`slice_opt`] modules of +//! this crate instead. +//! +//! #### Using existing schemas +//! If you have schemas that are already being used in other parts of your software stack, generating types +//! from the schema can be very useful. There is a **third-party** crate [`rsgen-avro`] that implements this. +//! +//! ## Serializing data +//! Writing data is very simple. Use [`T::get_schema()`](AvroSchema::get_schema()) to get the schema +//! for the type you want to serialize. It is recommended to keep this schema around as long as possible, +//! as generating the schema is quite expensive. Then create a [`Writer`](crate::Writer) with your schema +//! and use the [`append_ser()`](crate::Writer::append_ser()) function to serialize your data. +//! +//! ## Deserializing data +//! Reading data is both simpler and more complex than writing. On the one hand, you don't need to +//! generate a schema, as the Avro file has it embedded. But you can't directly deserialize from a +//! [`Reader`](crate::Reader). Instead, you have to iterate over the [`Value`](crate::types::Value)s +//! in the reader and deserialize from those via [`from_value`]. +//! +//! ## Putting it all together +//! +//! The following is an example of how to combine everything shown so far; it is meant to be a +//! quick reference for the Serde interface: +//! +//! ``` +//! # use std::io::Cursor; +//! # use serde::{Serialize, Deserialize}; +//! # use apache_avro::{AvroSchema, Error, Reader, Writer, serde::{from_value, to_value}}; +//! #[derive(AvroSchema, Serialize, Deserialize, PartialEq, Debug)] +//! struct Foo { +//! a: i64, +//! b: String, +//! // Otherwise it will be serialized as an array of integers +//! #[avro(with)] +//! #[serde(with = "apache_avro::serde::bytes")] +//! c: Vec<u8>, +//! } +//! +//! // Creating this schema is expensive, reuse it as much as possible +//! let schema = Foo::get_schema(); +//! // A writer needs the schema of the type that is going to be written +//! let mut writer = Writer::new(&schema, Vec::new())?; +//! +//! let foo = Foo { +//! a: 42, +//! b: "Hello".to_string(), +//! c: b"Data".to_vec() +//! }; +//! +//! // Serialize as many items as you want. +//! writer.append_ser(&foo)?; +//! writer.append_ser(&foo)?; +//! writer.append_ser(&foo)?; +//! +//! // Always flush +//!
writer.flush()?; +//! // Or consume the writer +//! let data = writer.into_inner()?; +//! +//! // The reader does not need a schema as it's included in the data +//! let reader = Reader::new(Cursor::new(data))?; +//! // The reader is an iterator +//! for result in reader { +//! let value = result?; +//! let new_foo: Foo = from_value(&value)?; +//! assert_eq!(new_foo, foo); +//! } +//! # Ok::<(), Error>(()) +//! ``` +//! +//! [`rsgen-avro`]: https://docs.rs/rsgen-avro/latest/rsgen_avro/ +//! [the trait documentation]: AvroSchema + mod de; mod derive; mod ser; diff --git a/avro/src/serde/ser_schema.rs b/avro/src/serde/ser_schema.rs index 2cfe380..63ecdad 100644 --- a/avro/src/serde/ser_schema.rs +++ b/avro/src/serde/ser_schema.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Logic for serde-compatible schema-aware serialization -//! which writes directly to a `Write` stream +//! Logic for serde-compatible schema-aware serialization which writes directly to a writer. use crate::schema::{DecimalSchema, InnerDecimalSchema, UuidSchema}; use crate::{ @@ -35,11 +34,11 @@ const COLLECTION_SERIALIZER_DEFAULT_INIT_ITEM_CAPACITY: usize = 32; const SINGLE_VALUE_INIT_BUFFER_SIZE: usize = 128; /// The sequence serializer for [`SchemaAwareWriteSerializer`]. -/// [`SchemaAwareWriteSerializeSeq`] may break large arrays up into multiple blocks to avoid having +/// +/// This may break large arrays up into multiple blocks to avoid having /// to obtain the length of the entire array before being able to write any data to the underlying -/// [`std::fmt::Write`] stream. (See the +/// writer (see the [Data Serialization and Deserialization] section of the Avro spec for more info). -/// [Data Serialization and Deserialization](https://avro.apache.org/docs/1.12.0/specification/#data-serialization-and-deserialization) +/// [Data Serialization and Deserialization]: https://avro.apache.org/docs/1.12.0/specification/#data-serialization-and-deserialization -/// section of the Avro spec for more info.) pub struct SchemaAwareWriteSerializeSeq<'a, 's, W: Write> { ser: &'a mut SchemaAwareWriteSerializer<'s, W>, item_schema: &'s Schema, @@ -143,11 +142,11 @@ impl<W: Write> ser::SerializeTuple for SchemaAwareWriteSerializeSeq<'_, '_, W> { } /// The map serializer for [`SchemaAwareWriteSerializer`]. -/// [`SchemaAwareWriteSerializeMap`] may break large maps up into multiple blocks to avoid having to -/// obtain the size of the entire map before being able to write any data to the underlying -/// [`std::fmt::Write`] stream. (See the +/// +/// This may break large maps up into multiple blocks to avoid having to obtain the size of the entire +/// map before being able to write any data to the underlying writer +/// (see [Data Serialization and Deserialization] for more info). -/// [Data Serialization and Deserialization](https://avro.apache.org/docs/1.12.0/specification/#data-serialization-and-deserialization) +/// [Data Serialization and Deserialization]: https://avro.apache.org/docs/1.12.0/specification/#data-serialization-and-deserialization -/// section of the Avro spec for more info.) pub struct SchemaAwareWriteSerializeMap<'a, 's, W: Write> { ser: &'a mut SchemaAwareWriteSerializer<'s, W>, item_schema: &'s Schema, @@ -245,9 +244,9 @@ impl<W: Write> ser::SerializeMap for SchemaAwareWriteSerializeMap<'_, '_, W> { } /// The struct serializer for [`SchemaAwareWriteSerializer`], which can serialize Avro records. -/// [`SchemaAwareWriteSerializeStruct`] can accept fields out of order, but doing so incurs a -/// performance penalty, since it requires [`SchemaAwareWriteSerializeStruct`] to buffer serialized -/// values in order to write them to the stream in order.
+/// +/// This can accept fields out of order, but doing so incurs a performance penalty, since it requires +/// buffering serialized values in order to write them to the stream in order. pub struct SchemaAwareWriteSerializeStruct<'a, 's, W: Write> { ser: &'a mut SchemaAwareWriteSerializer<'s, W>, record_schema: &'s RecordSchema, @@ -427,7 +426,10 @@ impl<W: Write> ser::SerializeStruct for SchemaAwareWriteSerializeStruct<'_, '_, } } -/// This implementation is used to support `#[serde(flatten)]` as that uses SerializeMap instead of SerializeStruct. +/// This implementation is used to support `#[serde(flatten)]` as that uses [`SerializeMap`] instead of [`SerializeStruct`]. +/// +/// [`SerializeMap`]: ser::SerializeMap +/// [`SerializeStruct`]: ser::SerializeStruct impl<W: Write> ser::SerializeMap for SchemaAwareWriteSerializeStruct<'_, '_, W> { type Ok = usize; type Error = Error; @@ -531,7 +533,8 @@ impl<W: Write> ser::SerializeMap for SchemaAwareWriteSerializeMapOrStruct<'_, '_ } /// The tuple struct serializer for [`SchemaAwareWriteSerializer`]. -/// [`SchemaAwareWriteSerializeTupleStruct`] can serialize to an Avro array, record, or big-decimal. +/// +/// This can serialize to an Avro array, record, or big-decimal. /// When serializing to a record, fields must be provided in the correct order, since no names are provided. pub enum SchemaAwareWriteSerializeTupleStruct<'a, 's, W: Write> { Record(SchemaAwareWriteSerializeStruct<'a, 's, W>), @@ -593,11 +596,11 @@ impl<W: Write> ser::SerializeTupleVariant for SchemaAwareWriteSerializeTupleStru } } -/// A [`serde::ser::Serializer`] implementation that serializes directly to a [`std::fmt::Write`] -/// using the provided schema. If [`SchemaAwareWriteSerializer`] isn't able to match the incoming -/// data with its schema, it will return an error. -/// A [`SchemaAwareWriteSerializer`] instance can be re-used to serialize multiple values matching -/// the schema to its [`std::fmt::Write`] stream. +/// A [`Serializer`](ser::Serializer) implementation that serializes directly to raw Avro data. +/// +/// If the data does not match the schema, it will return an error. +/// +/// This does not keep state and can therefore be reused to write to the same writer. pub struct SchemaAwareWriteSerializer<'s, W: Write> { writer: &'s mut W, root_schema: &'s Schema, diff --git a/avro/src/serde/with.rs b/avro/src/serde/with.rs index 670955b..bbcd7de 100644 --- a/avro/src/serde/with.rs +++ b/avro/src/serde/with.rs @@ -18,14 +18,14 @@ use std::cell::Cell; thread_local! { - /// A thread local that is used to decide how to serialize Rust bytes into an Avro - /// `types::Value` of type bytes. + /// A thread local that is used to decide if Rust bytes need to be serialized to + /// [`Value::Bytes`] or [`Value::Fixed`]. /// /// Relies on the fact that serde's serialization process is single-threaded. pub(crate) static SER_BYTES_TYPE: Cell<BytesType> = const { Cell::new(BytesType::Bytes) }; - /// A thread local that is used to decide how to deserialize an Avro `types::Value` - /// of type bytes into Rust bytes. + /// A thread local that is used to decide if a [`Value::Bytes`] needs to be deserialized to + /// a [`Vec`] or slice. /// /// Relies on the fact that serde's deserialization process is single-threaded. pub(crate) static DE_BYTES_BORROWED: Cell<bool> = const { Cell::new(false) }; @@ -72,11 +72,12 @@ impl Drop for BorrowedGuard { /// This module is intended to be used through the Serde `with` attribute. /// Use [`apache_avro::serde::bytes_opt`] for optional bytes.
/// +/// When used with different serialization formats, this is equivalent to [`serde_bytes`]. +/// /// See usage with below example: /// ``` /// # use apache_avro::AvroSchema; /// # use serde::{Deserialize, Serialize}; -/// /// #[derive(AvroSchema, Serialize, Deserialize)] /// struct StructWithBytes { /// #[avro(with)] @@ -132,11 +133,12 @@ pub mod bytes { /// This module is intended to be used through the Serde `with` attribute. /// Use [`apache_avro::serde::bytes`] for non-optional bytes. /// +/// When used with different serialization formats, this is equivalent to [`serde_bytes`]. +/// /// See usage with below example: /// ``` /// # use apache_avro::AvroSchema; /// # use serde::{Deserialize, Serialize}; -/// /// #[derive(AvroSchema, Serialize, Deserialize)] /// struct StructWithBytes { /// #[avro(with)] @@ -196,11 +198,12 @@ pub mod bytes_opt { /// This module is intended to be used through the Serde `with` attribute. /// Use [`apache_avro::serde::fixed_opt`] for optional fixed values. /// +/// When used with different serialization formats, this is equivalent to [`serde_bytes`]. +/// /// See usage with below example: /// ``` /// # use apache_avro::AvroSchema; /// # use serde::{Deserialize, Serialize}; -/// /// #[derive(AvroSchema, Serialize, Deserialize)] /// struct StructWithBytes { /// #[avro(with)] @@ -271,11 +274,12 @@ pub mod fixed { /// This module is intended to be used through the Serde `with` attribute. /// Use [`apache_avro::serde::fixed`] for non-optional fixed values. /// +/// When used with different serialization formats, this is equivalent to [`serde_bytes`]. +/// /// See usage with below example: /// ``` /// # use apache_avro::AvroSchema; /// # use serde::{Deserialize, Serialize}; -/// /// #[derive(AvroSchema, Serialize, Deserialize)] /// struct StructWithBytes { /// #[avro(with)] @@ -348,11 +352,12 @@ pub mod fixed_opt { /// /// Use [`apache_avro::serde::slice_opt`] for optional bytes/fixed borrowed values. /// +/// When used with different serialization formats, this is equivalent to [`serde_bytes`]. +/// /// See usage with below example: -/// ```rust +/// ``` /// # use apache_avro::AvroSchema; /// # use serde::{Deserialize, Serialize}; -/// /// #[derive(AvroSchema, Serialize, Deserialize)] /// struct StructWithBytes<'a> { /// #[avro(with)] @@ -411,11 +416,12 @@ pub mod slice { /// /// Use [`apache_avro::serde::slice`] for non-optional bytes/fixed borrowed values. /// +/// When used with different serialization formats, this is equivalent to [`serde_bytes`]. +/// /// See usage with below example: /// ``` /// # use apache_avro::AvroSchema; /// # use serde::{Deserialize, Serialize}; -/// /// #[derive(AvroSchema, Serialize, Deserialize)] /// struct StructWithBytes<'a> { /// #[avro(with)] diff --git a/avro/src/types.rs b/avro/src/types.rs index e375f71..e33fd84 100644 --- a/avro/src/types.rs +++ b/avro/src/types.rs @@ -240,11 +240,10 @@ impl Record<'_> { } } - /// Put a compatible value (implementing the `ToAvro` trait) in the - /// `Record` for a given `field` name. + /// Add a field to the `Record`. /// - /// **NOTE** Only ensure that the field name is present in the `Schema` given when creating - /// this `Record`. Does not perform any schema validation. + // TODO: This should return an error or at least panic + /// **NOTE**: If the field name does not exist in the schema, the value is silently dropped. pub fn put<V>(&mut self, field: &str, value: V) where V: Into<Value>, @@ -255,6 +254,7 @@ impl Record<'_> { } /// Get the value for a given field name.
+ /// + /// Returns `None` if the field is not present in the schema. pub fn get(&self, field: &str) -> Option<&Value> { self.schema_lookup diff --git a/avro/src/util.rs b/avro/src/util.rs index 748e923..8acbfa4 100644 --- a/avro/src/util.rs +++ b/avro/src/util.rs @@ -24,20 +24,21 @@ use std::{ sync::OnceLock, }; -/// Maximum number of bytes that can be allocated when decoding -/// Avro-encoded values. This is a protection against ill-formed -/// data, whose length field might be interpreted as enormous. -/// See max_allocation_bytes to change this limit. +/// Maximum number of bytes that can be allocated when decoding Avro-encoded values. +/// +/// This is a protection against ill-formed data, whose length field might be interpreted as enormous. +/// +/// See [`max_allocation_bytes`] to change this limit. pub const DEFAULT_MAX_ALLOCATION_BYTES: usize = 512 * 1024 * 1024; static MAX_ALLOCATION_BYTES: OnceLock<usize> = OnceLock::new(); -/// Whether to set serialization & deserialization traits -/// as `human_readable` or not. -/// See [set_serde_human_readable] to change this value. +/// Whether to set serialization & deserialization traits as `human_readable` or not. +/// +/// See [`set_serde_human_readable`] to change this value. +pub const DEFAULT_SERDE_HUMAN_READABLE: bool = false; +/// Whether the serializer and deserializer should indicate to types that the format is human-readable. // crate-visible for testing pub(crate) static SERDE_HUMAN_READABLE: OnceLock<bool> = OnceLock::new(); -/// Whether the serializer and deserializer should indicate to types that the format is human-readable. -pub const DEFAULT_SERDE_HUMAN_READABLE: bool = false; pub(crate) trait MapHelper { fn string(&self, key: &str) -> Option<String>; diff --git a/avro/src/validator.rs b/avro/src/validator.rs index 971870c..4623302 100644 --- a/avro/src/validator.rs +++ b/avro/src/validator.rs @@ -15,6 +15,39 @@ // specific language governing permissions and limitations // under the License. +//! # Custom name validation +//! +//! By default, the library follows the rules specified in the [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#names). +//! +//! Some of the other Apache Avro language SDKs are more flexible in their name validation. For +//! interoperability with those SDKs, the library provides a way to customize the name validation. +//! +//! ``` +//! use apache_avro::AvroResult; +//! use apache_avro::schema::Namespace; +//! use apache_avro::validator::{SchemaNameValidator, set_schema_name_validator}; +//! +//! struct MyCustomValidator; +//! +//! impl SchemaNameValidator for MyCustomValidator { +//! fn validate(&self, name: &str) -> AvroResult<(String, Namespace)> { +//! todo!() +//! } +//! } +//! +//! // don't parse any schema before registering the custom validator(s)! +//! +//! set_schema_name_validator(Box::new(MyCustomValidator)); +//! +//! // ... use the library +//! ``` +//! +//! Similar logic can be applied to schema namespace, enum symbol and field name validation. +//! +//! **Note**: the library allows setting a validator only once per application lifetime! +//! If the application parses schemas before setting a validator, the default validator will be +//! registered and used! + use crate::{AvroResult, error::Details, schema::Namespace}; use log::debug; use regex_lite::Regex; @@ -24,10 +57,12 @@ use std::sync::OnceLock; struct SpecificationValidator; /// A trait that validates schema names. -/// To register a custom one use [set_schema_name_validator].
+/// +/// To register a custom one use [`set_schema_name_validator`]. pub trait SchemaNameValidator: Send + Sync { - /// Returns the regex used to validate the schema name - /// according to the Avro specification. + /// The regex used to validate the schema name. + /// + /// The default implementation uses the Avro-specified regex. fn regex(&self) -> &'static Regex { static SCHEMA_NAME_ONCE: OnceLock<Regex> = OnceLock::new(); SCHEMA_NAME_ONCE.get_or_init(|| { @@ -39,8 +74,9 @@ }) } - /// Validates the schema name and returns the name and the optional namespace, - /// or [Details::InvalidSchemaName] if it is invalid. + /// Validates the schema name and returns the name and the optional namespace. + /// + /// Should return [`Details::InvalidSchemaName`] if it is invalid. fn validate(&self, schema_name: &str) -> AvroResult<(String, Namespace)>; } @-61,8 +97,7 -- static NAME_VALIDATOR_ONCE: OnceLock<Box<dyn SchemaNameValidator + Send + Sync>> /// Sets a custom schema name validator. /// -/// Returns a unit if the registration was successful or the already -/// registered validator if the registration failed. +/// Returns `Err(validator)` if a validator is already configured. /// /// **Note**: This function must be called before parsing any schema because this will /// register the default validator and the registration is one time only! @@ -83,10 +118,12 @@ pub(crate) fn validate_schema_name(schema_name: &str) -> AvroResult<(String, Nam } /// A trait that validates schema namespaces. -/// To register a custom one use [set_schema_namespace_validator]. +/// +/// To register a custom one use [`set_schema_namespace_validator`]. pub trait SchemaNamespaceValidator: Send + Sync { - /// Returns the regex used to validate the schema namespace - /// according to the Avro specification. + /// The regex used to validate the schema namespace. + /// + /// The default implementation uses the Avro-specified regex. fn regex(&self) -> &'static Regex { static NAMESPACE_ONCE: OnceLock<Regex> = OnceLock::new(); NAMESPACE_ONCE.get_or_init(|| { @@ -94,7 +131,9 @@ }) } - /// Validates the schema namespace or [Details::InvalidNamespace] if it is invalid. + /// Validates a schema namespace. + /// + /// Should return [`Details::InvalidNamespace`] if it is invalid. fn validate(&self, namespace: &str) -> AvroResult<()>; } @@ -114,8 +153,7 @@ static NAMESPACE_VALIDATOR_ONCE: OnceLock<Box<dyn SchemaNamespaceValidator + Sen /// Sets a custom schema namespace validator. /// -/// Returns a unit if the registration was successful or the already -/// registered validator if the registration failed. +/// Returns `Err(validator)` if a validator is already configured. /// /// **Note**: This function must be called before parsing any schema because this will /// register the default validator and the registration is one time only! @@ -135,17 +173,20 @@ pub(crate) fn validate_namespace(ns: &str) -> AvroResult<()> { } /// A trait that validates enum symbol names. -/// To register a custom one use [set_enum_symbol_name_validator]. +/// +/// To register a custom one use [`set_enum_symbol_name_validator`]. pub trait EnumSymbolNameValidator: Send + Sync { - /// Returns the regex used to validate the symbols of enum schema - /// according to the Avro specification. + /// The regex used to validate the symbols of enums. + /// + /// The default implementation uses the Avro-specified regex.
@@ -135,17 +173,20 @@ pub(crate) fn validate_namespace(ns: &str) -> AvroResult<()> {
 }
 
 /// A trait that validates enum symbol names.
-/// To register a custom one use [set_enum_symbol_name_validator].
+///
+/// To register a custom one use [`set_enum_symbol_name_validator`].
 pub trait EnumSymbolNameValidator: Send + Sync {
-    /// Returns the regex used to validate the symbols of enum schema
-    /// according to the Avro specification.
+    /// The regex used to validate the symbols of enums.
+    ///
+    /// The default implementation uses the regex from the Avro specification.
     fn regex(&self) -> &'static Regex {
         static ENUM_SYMBOL_NAME_ONCE: OnceLock<Regex> = OnceLock::new();
         ENUM_SYMBOL_NAME_ONCE.get_or_init(|| Regex::new(r"^[A-Za-z_][A-Za-z0-9_]*$").unwrap())
     }
 
-    /// Validates the symbols of an Enum schema name and returns nothing (unit),
-    /// or [Details::EnumSymbolName] if it is invalid.
+    /// Validates a single enum symbol.
+    ///
+    /// Should return [`Details::EnumSymbolName`] if it is invalid.
     fn validate(&self, name: &str) -> AvroResult<()>;
 }
 
@@ -165,8 +206,7 @@ static ENUM_SYMBOL_NAME_VALIDATOR_ONCE: OnceLock<Box<dyn EnumSymbolNameValidator
 /// Sets a custom enum symbol name validator.
 ///
-/// Returns a unit if the registration was successful or the already
-/// registered validator if the registration failed.
+/// Returns `Err(validator)` if a validator is already configured.
 ///
 /// **Note**: This function must be called before parsing any schema because this will
 /// register the default validator and the registration is one time only!
@@ -186,17 +226,20 @@ pub(crate) fn validate_enum_symbol_name(symbol: &str) -> AvroResult<()> {
 }
 
 /// A trait that validates record field names.
-/// To register a custom one use [set_record_field_name_validator].
+///
+/// To register a custom one use [`set_record_field_name_validator`].
 pub trait RecordFieldNameValidator: Send + Sync {
-    /// Returns the regex used to validate the record field names
-    /// according to the Avro specification.
+    /// The regex used to validate the record field names.
+    ///
+    /// The default implementation uses the regex from the Avro specification.
     fn regex(&self) -> &'static Regex {
         static FIELD_NAME_ONCE: OnceLock<Regex> = OnceLock::new();
         FIELD_NAME_ONCE.get_or_init(|| Regex::new(r"^[A-Za-z_][A-Za-z0-9_]*$").unwrap())
    }
 
-    /// Validates the record field's names and returns nothing (unit),
-    /// or [Details::FieldName] if it is invalid.
+    /// Validates the name of a record field.
+    ///
+    /// Should return [`Details::FieldName`] if it is invalid.
     fn validate(&self, name: &str) -> AvroResult<()>;
 }
 
@@ -216,8 +259,7 @@ static RECORD_FIELD_NAME_VALIDATOR_ONCE: OnceLock<Box<dyn RecordFieldNameValidat
 /// Sets a custom record field name validator.
 ///
-/// Returns a unit if the registration was successful or the already
-/// registered validator if the registration failed.
+/// Returns `Err(validator)` if a validator is already configured.
 ///
 /// **Note**: This function must be called before parsing any schema because this will
 /// register the default validator and the registration is one time only!
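All four setters share the same one-shot registration contract. A hedged sketch of that contract in use, assuming the setter hands the rejected validator back in the `Err` case as documented above:

```rust
use apache_avro::AvroResult;
use apache_avro::validator::{EnumSymbolNameValidator, set_enum_symbol_name_validator};

struct AllowAnySymbol;

impl EnumSymbolNameValidator for AllowAnySymbol {
    fn validate(&self, _name: &str) -> AvroResult<()> {
        Ok(()) // accept every symbol, for interop with more lenient SDKs
    }
}

fn main() {
    // The first registration wins...
    assert!(set_enum_symbol_name_validator(Box::new(AllowAnySymbol)).is_ok());
    // ...and any later attempt gets the rejected validator back as Err.
    assert!(set_enum_symbol_name_validator(Box::new(AllowAnySymbol)).is_err());
}
```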
diff --git a/avro/src/writer.rs b/avro/src/writer.rs
index 32808d1..1690259 100644
--- a/avro/src/writer.rs
+++ b/avro/src/writer.rs
@@ -178,7 +178,7 @@ impl<'a, W: Write> Writer<'a, W> {
         self.append_value(value)
     }
 
-    /// Append a compatible value to a `Writer`, also performing schema validation.
+    /// Append a value to the `Writer`, also performing schema validation.
     ///
     /// Returns the number of bytes written (it might be 0, see below).
     ///
@@ -190,7 +190,7 @@ impl<'a, W: Write> Writer<'a, W> {
         self.append_value_ref(&avro)
     }
 
-    /// Append a compatible value to a `Writer`, also performing schema validation.
+    /// Append a compatible value to the `Writer`, also performing schema validation.
     ///
     /// Returns the number of bytes written (it might be 0, see below).
     ///
@@ -287,8 +287,7 @@ impl<'a, W: Write> Writer<'a, W> {
         Ok(n)
     }
 
-    /// Extend a `Writer` with an `Iterator` of compatible values (implementing the `ToAvro`
-    /// trait), also performing schema validation.
+    /// Extend a `Writer` with an `Iterator` of values, also performing schema validation.
     ///
     /// Returns the number of bytes written.
     ///
@@ -551,8 +550,7 @@ impl<W: Write> Drop for Writer<'_, W> {
     }
 }
 
-/// Encode a compatible value (implementing the `ToAvro` trait) into Avro format, also performing
-/// schema validation.
+/// Encode a value into raw Avro data, also performing schema validation.
 ///
 /// This is an internal function which gets the bytes buffer where to write as parameter instead of
 /// creating a new one like `to_avro_datum`.
@@ -767,8 +765,7 @@ fn write_value_ref_owned_resolved<W: Write>(
     )
 }
 
-/// Encode a compatible value (implementing the `ToAvro` trait) into Avro format, also
-/// performing schema validation.
+/// Encode a value into raw Avro data, also performing schema validation.
 ///
 /// **NOTE**: This function has a quite small niche of usage and does NOT generate headers and sync
 /// markers; use [`Writer`] to be fully Avro-compatible if you don't know what
@@ -780,6 +777,7 @@ pub fn to_avro_datum<T: Into<Value>>(schema: &Schema, value: T) -> AvroResult<Ve
 }
 
 /// Write the referenced [Serialize]able object to the provided [Write] object.
+///
 /// Returns a result with the number of bytes written.
 ///
 /// **NOTE**: This function has a quite small niche of usage and does **NOT** generate headers and sync
@@ -795,8 +793,8 @@ pub fn write_avro_datum_ref<T: Serialize, W: Write>(
     data.serialize(&mut serializer)
 }
 
-/// Encode a compatible value (implementing the `ToAvro` trait) into Avro format, also
-/// performing schema validation.
+/// Encode a value into raw Avro data, also performing schema validation.
+///
 /// If the provided `schema` is incomplete then its dependencies must be
 /// provided in `schemata`
 pub fn to_avro_datum_schemata<T: Into<Value>>(
diff --git a/avro/tests/io.rs b/avro/tests/io.rs
index 5284426..dba7eef 100644
--- a/avro/tests/io.rs
+++ b/avro/tests/io.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Port of https://github.com/apache/avro/blob/release-1.9.1/lang/py/test/test_io.py
+//! Port of <https://github.com/apache/avro/blob/release-1.9.1/lang/py/test/test_io.py>
 use apache_avro::{Error, Schema, error::Details, from_avro_datum, to_avro_datum, types::Value};
 use apache_avro_test_helper::TestResult;
 use pretty_assertions::assert_eq;
diff --git a/avro_derive/build.rs b/avro_derive/build.rs
index 54691a8..40e33af 100644
--- a/avro_derive/build.rs
+++ b/avro_derive/build.rs
@@ -18,7 +18,7 @@
 //! Set the `nightly` cfg value on nightly toolchains.
 //!
 //! We would prefer to just do `#![rustversion::attr(nightly, feature(proc_macro_diagnostic)]`
-//! but that's currently not possible, see https://github.com/dtolnay/rustversion/issues/8
+//! but that's currently not possible, see <https://github.com/dtolnay/rustversion/issues/8>
 
 #[rustversion::nightly]
 fn main() {
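The `to_avro_datum` family pairs with `from_avro_datum`, imported by the test port above. A minimal round-trip sketch with a simple record schema; as the NOTE lines say, no header or sync markers are involved, so the reader needs the same schema:

```rust
use apache_avro::{Schema, from_avro_datum, to_avro_datum, types::Value};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Schema::parse_str(
        r#"{"type": "record", "name": "test", "fields": [{"name": "a", "type": "long"}]}"#,
    )?;
    let value = Value::Record(vec![("a".to_string(), Value::Long(27))]);

    // Encode a bare datum (no container-file framing)...
    let encoded = to_avro_datum(&schema, value.clone())?;
    // ...and decode it back with the writer schema (no reader schema here).
    let decoded = from_avro_datum(&schema, &mut encoded.as_slice(), None)?;
    assert_eq!(decoded, value);
    Ok(())
}
```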
diff --git a/avro_derive/src/case.rs b/avro_derive/src/case.rs
index b5d5c38..c958562 100644
--- a/avro_derive/src/case.rs
+++ b/avro_derive/src/case.rs
@@ -30,24 +30,24 @@ pub enum RenameRule {
     /// Don't apply a default rename rule.
     #[default]
     None,
-    /// Rename direct children to "lowercase" style.
+    /// Rename direct children to `lowercase` style.
     LowerCase,
-    /// Rename direct children to "UPPERCASE" style.
+    /// Rename direct children to `UPPERCASE` style.
     UpperCase,
-    /// Rename direct children to "PascalCase" style, as typically used for
+    /// Rename direct children to `PascalCase` style, as typically used for
     /// enum variants.
     PascalCase,
-    /// Rename direct children to "camelCase" style.
+    /// Rename direct children to `camelCase` style.
     CamelCase,
-    /// Rename direct children to "snake_case" style, as commonly used for
+    /// Rename direct children to `snake_case` style, as commonly used for
     /// fields.
     SnakeCase,
-    /// Rename direct children to "SCREAMING_SNAKE_CASE" style, as commonly
+    /// Rename direct children to `SCREAMING_SNAKE_CASE` style, as commonly
     /// used for constants.
     ScreamingSnakeCase,
-    /// Rename direct children to "kebab-case" style.
+    /// Rename direct children to `kebab-case` style.
     KebabCase,
-    /// Rename direct children to "SCREAMING-KEBAB-CASE" style.
+    /// Rename direct children to `SCREAMING-KEBAB-CASE` style.
     ScreamingKebabCase,
 }
diff --git a/avro_derive/src/lib.rs b/avro_derive/src/lib.rs
index 3904452..0e1631f 100644
--- a/avro_derive/src/lib.rs
+++ b/avro_derive/src/lib.rs
@@ -17,6 +17,14 @@
 #![cfg_attr(nightly, feature(proc_macro_diagnostic))]
 
+//! This crate provides the `AvroSchema` derive macro.
+//! ```ignore
+//! #[derive(AvroSchema)]
+//! ```
+//! Please see the documentation of the [`AvroSchema`] trait for instructions on how to use it.
+//!
+//! [`AvroSchema`]: https://docs.rs/apache-avro/latest/apache_avro/schema/trait.AvroSchema.html
+
 mod attributes;
 mod case;
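The derive is meant to be consumed through the main crate, not this proc-macro crate directly. A short sketch, assuming the `derive` feature of `apache-avro` is enabled so both the trait and the macro are importable from the crate root:

```rust
use apache_avro::AvroSchema;

#[derive(AvroSchema)]
struct Test {
    a: i64,
    b: String,
}

fn main() {
    // get_schema() comes from the derived AvroSchema implementation.
    println!("{:?}", Test::get_schema());
}
```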
diff --git a/avro_derive/tests/ui.rs b/avro_derive/tests/ui.rs
index 5337257..9b9fd7d 100644
--- a/avro_derive/tests/ui.rs
+++ b/avro_derive/tests/ui.rs
@@ -17,7 +17,7 @@
 /// These tests only run on nightly as the output can change per compiler version.
 ///
-/// See https://github.com/dtolnay/trybuild/issues/84
+/// See <https://github.com/dtolnay/trybuild/issues/84>
 #[rustversion::attr(not(nightly), ignore)]
 #[test]
 fn ui() {
diff --git a/avro_test_helper/src/lib.rs b/avro_test_helper/src/lib.rs
index 6366a8e..9b5248d 100644
--- a/avro_test_helper/src/lib.rs
+++ b/avro_test_helper/src/lib.rs
@@ -53,7 +53,8 @@ fn after_all() {
 #[derive(Debug)]
 pub struct TestError;
 
-/// A converter of any error into [TestError].
+/// A converter of any error into [`TestError`].
+///
 /// It is used to print better error messages in the tests.
 /// Borrowed from <https://bluxte.net/musings/2023/01/08/improving_failure_messages_rust_tests/>
 // The Display bound is needed so that the `From` implementation doesn't