diff options
Diffstat (limited to 'vendor/encode_unicode-0.3.6')
20 files changed, 5017 insertions, 0 deletions
diff --git a/vendor/encode_unicode-0.3.6/.cargo-checksum.json b/vendor/encode_unicode-0.3.6/.cargo-checksum.json new file mode 100644 index 00000000..75bdcfff --- /dev/null +++ b/vendor/encode_unicode-0.3.6/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"AUTHORS.md":"f2cf336738ad935a482a799be004083ddd07c904513caf80f9e48011888fe1b6","Cargo.toml":"6fd14a963bfb44b78883bb57c082300138668bc163c949286453836116c7018d","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"da23be69ad3ccf7a5823d62152efcd2b15de9482a5014fcb1b02844662d86abd","README.md":"c33df8cbe2645cd55d3644d4af453c0bd3cf3ffaa4a0c15ac6f3162fade966d6","RELEASES.md":"0a10f449adcf53ab00a43bb0242e38b508a51b529b3afc02eb645dbb0216b3bc","benches/multiiterators.rs":"69c878d010856a24247085356ed3045c6ceb1ac88cd75ea7a00b11206090debe","src/decoding_iterators.rs":"72c9fe0d10240e021dfc46546e814f04ab5c7e142a04305fad8c8da48575fe92","src/errors.rs":"0355e926edd1c8e81b537aca1a80fc324912a8c21e84db278c232860e3476822","src/lib.rs":"972010cd7f1b24dd048d066f1a3ff57fc16d4486a7e3583f7ae995dbd1ada5c8","src/traits.rs":"7ec1b649f23410e55bbfe6df13713040bcd292ee90a81d31437291ad100ea99f","src/utf16_char.rs":"c014de07ebc08592b3527e62d66699c8d637c8c9491835341ccdc71e28f346a1","src/utf16_iterators.rs":"9344132fb95077f05b6da8d9da77eb38ddfc8134e543313d74f286e7e545e875","src/utf8_char.rs":"349a3ebafa8ae2c88efa334958ea4d1863ca559d1b17aeb2c53113cd9afc7d15","src/utf8_iterators.rs":"e3d3bbb23253a582c48985b4eb26a4febfb40fc79a12e5fc1ca28c4cdef9fe81","tests/errs.rs":"7244966b93fc98c19a9ca163870863c1fba6107d526f8c2baa065696eb4cf9b4","tests/exhaustive.rs":"25c71761e57ac45125c2d527ddc5fc5e7b89b9b1055df3db50bce81af9844250","tests/iterators.rs":"1bda1ea031950134eef6b21e8473f4c7a2d338822cd40bb47267db4fe608d586","tests/oks.rs":"9c3e571488bc66696f7cd518f89e25bbb0ad382cbf2834f7461043237c42f6d9"},"package":"a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"}
\ No newline at end of file diff --git a/vendor/encode_unicode-0.3.6/AUTHORS.md b/vendor/encode_unicode-0.3.6/AUTHORS.md new file mode 100644 index 00000000..6759605b --- /dev/null +++ b/vendor/encode_unicode-0.3.6/AUTHORS.md @@ -0,0 +1,4 @@ +# The encode_unicode Developers + +* Torbjørn Birch Moltu +* Aljoscha Meyer diff --git a/vendor/encode_unicode-0.3.6/Cargo.toml b/vendor/encode_unicode-0.3.6/Cargo.toml new file mode 100644 index 00000000..618c9c74 --- /dev/null +++ b/vendor/encode_unicode-0.3.6/Cargo.toml @@ -0,0 +1,39 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "encode_unicode" +version = "0.3.6" +authors = ["Torbjørn Birch Moltu <t.b.moltu@lyse.net>"] +description = "UTF-8 and UTF-16 character types, iterators and related methods for char, u8 and u16.\n" +documentation = "https://docs.rs/encode_unicode/" +readme = "README.md" +keywords = ["unicode", "UTF-8", "UTF-16"] +categories = ["encoding", "no-std"] +license = "MIT/Apache-2.0" +repository = "https://github.com/tormol/encode_unicode" +[package.metadata.docs.rs] +features = ["ascii/std"] +[dependencies.ascii] +version = ">=0.8, <2" +optional = true +default-features = false + +[dependencies.clippy] +version = "0.*" +optional = true + +[features] +default = ["std"] +std = [] +[target."cfg(unix)".dev-dependencies.lazy_static] +version = "1.0.*" diff --git a/vendor/encode_unicode-0.3.6/LICENSE-APACHE b/vendor/encode_unicode-0.3.6/LICENSE-APACHE new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/vendor/encode_unicode-0.3.6/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/encode_unicode-0.3.6/LICENSE-MIT b/vendor/encode_unicode-0.3.6/LICENSE-MIT new file mode 100644 index 00000000..de22e428 --- /dev/null +++ b/vendor/encode_unicode-0.3.6/LICENSE-MIT @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE diff --git a/vendor/encode_unicode-0.3.6/README.md b/vendor/encode_unicode-0.3.6/README.md new file mode 100644 index 00000000..b1bfc6c4 --- /dev/null +++ b/vendor/encode_unicode-0.3.6/README.md @@ -0,0 +1,59 @@ +# encode_unicode + +UTF-8 and UTF-16 character types, iterators and related methods for `char`, `u8` and `u16`. + +[![crates.io page](https://img.shields.io/crates/v/encode_unicode.svg)](https://crates.io/crates/encode_unicode/) + +## Features + +* **[`Utf8Char`](https://docs.rs/encode_unicode/latest/encode_unicode/struct.Utf8Char.html)**: + A `char` stored as UTF-8. Can be borrowed as a `str` or `u8` slice. +* **[`Utf16Char`](https://docs.rs/encode_unicode/latest/encode_unicode/struct.Utf16Char.html)**: + A `char` stored as UTF-16. Can be borrowed as an `u16` slice. +* [Conversion methods on `char`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.CharExt.html): + * to and from UTF-8 as `[u8; 4]` or slice. + * to and from UTF-16 as `(u16, Option<u16>)` or slice. +* [Iterator adapters](https://docs.rs/encode_unicode/latest/encode_unicode/trait.IterExt.html) + for converting betwenn `u8`s and `Utf8Char`s or `u16`s and `Utf16Char`s. +* Optimized [slice-based decoding iterators](https://docs.rs/encode_unicode/latest/encode_unicode/trait.SliceExt.html). +* [Precise errors when decoding a char from UTF-8, UTF-16 or `u32` fails](http://docs.rs/encode_unicode/latest/encode_unicode/error/index.html). +* Utility methods on [`u8`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.U8UtfExt.html) + and [`u16`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.U16UtfExt.html). + +The minimum supported version of Rust is 1.15, +older versions might work now but can break with a minor update. + +## Optional features + +* `#![no_std]`-mode: There are a few differences: + * `Error` doesn't exist, but `description()` is made available as an inherent impl. + * `Extend`/`FromIterator`-implementations for `String`/`Vec<u8>`/`Vec<u16>` are missing. + * There is no `io`, so `Utf8Iterator` and `Utf8CharSplitter` doesn't implement `Read`. + This feature is enabled by setting `default-features=false` in `Cargo.toml`: + `encode_unicode = {version="0.3.4", default-features=false}`. +* Integration with the [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) crate: + Convert `Utf8Char` and `Utf16Char` to and from [ascii::`AsciiChar`](https://tomprogrammer.github.io/rust-ascii/ascii/enum.AsciiChar.html). + +## License + +Licensed under either of + +* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) + +at your option. + +### Contribution + +Unless you explicitly state otherwise, any contribution intentionally +submitted for inclusion in the work by you, as defined in the Apache-2.0 +license, shall be dual licensed as above, without any additional terms or +conditions. + +## History + +The original purpose of this crate was to provide standins for the then +unstable `encode_utf8()` and `encode_utf16()`. +The standins were removed in 0.3 when Rust 1.15 stabilized the `encode_` +methods, but the other stuff I added, such as iterators like +those `encode_utf{8,16}() returned for a while, might still be of use. diff --git a/vendor/encode_unicode-0.3.6/RELEASES.md b/vendor/encode_unicode-0.3.6/RELEASES.md new file mode 100644 index 00000000..371f8bf8 --- /dev/null +++ b/vendor/encode_unicode-0.3.6/RELEASES.md @@ -0,0 +1,74 @@ +Version 0.3.6 (2019-08-23) +========================== +* Fix pointless undefined behavior in `Utf16Char.to_ascii_char()` (which is part of ascii feature) +* Widen ascii version requirement to include 1.* +* Add `[u16; 2]` UTF-16 array alternatives to `(u16, Some(u16))` UTF-16 tuple methods +* Add `Utf16Char.is_bmp()` + +Version 0.3.5 (2018-10-23) +========================== +* Fix docs.rs build failure + +Version 0.3.4 (2018-10-23) +========================== +* Fix UB in UTF-8 validation which lead to invalid codepoints being accepted in release mode +* Add fallible decoding iterator adapters `Utf8CharMerger` and `Utf16CharMerger` + and slice-based iterators `Utf8CharDecoder` and `Utf16CharDecoder` +* Widen ascii version requirement from 0.8.* to 0.8.0 - 0.10.* +* Implement creating / extending `String`s from `Utf16Char`-producing iterators + +Version 0.3.3 (2018-10-16) +========================== +* Fix UTF-8 overlong check. (`from_array()` and `from_slice()` accepted two-byte encodings of ASCII characters >= '@', which includes all letters) +* Implement `FromStr` for `Utf16Char` +* Add `from_str_start()` to `Utf8Char` and `Utf16Char` +* Add `Utf{8,16}Char{s,Indices}`: `str`-based iterators for `Utf8Char` and `Utf16Char` equivalent to `char`'s `Chars` and `CharIndices`. +* Add `StrExt` with functions to create the above iterators. +* Implement `FromIterator` and `Extend` for `Vec<{u8,u16}>` with reference-producing `Utf{8,16}Char` iterators too. +* Add `Utf8CharSplitter` and `Utf16CharSplitter`: `Utf{8,16}Char`-to-`u{8,16}` iterator adapters. +* Add `IterExt`, `iter_bytes()` and `iter_units()` to create the above splitting iterators. +* Add `Utf8Char::from_ascii()`, `Utf16Char::from_bmp()` with `_unchecked` versions of both. +* Add cross-type `PartialEq` and `PartialOrd` implementations. +* Change the `description()` for a few error types. + +Version 0.3.2 (2018-08-08) +========================== +* Hide `AsciiExt` deprecation warning and add replacement methods. +* Correct documentation for `U8UtfExt::extra_utf8_bytes()`. +* Fix misspellings in some error descriptions. +* Avoid potentially bad transmutes. + +Version 0.3.1 (2017-06-16) +========================== +* Implement `Display` for `Utf8Char` and `Utf16Char`. + +Version 0.3.0 (2017-03-29) +========================== +* Replace the "no_std" feature with opt-out "std". + * Upgrade ascii to v0.8. + * Make tests compile on stable. +* Remove `CharExt::write_utf{8,16}()` because `encode_utf{8,16}()` has been stabilized. +* Return a proper error from `U16UtfExt::utf16_needs_extra_unit()` instead of `None`. +* Rename `U16UtfExt::utf_is_leading_surrogate()` to `is_utf16_leading_surrogate()`. +* Rename `Utf16Char::from_slice()` to `from_slice_start()` and `CharExt::from_utf{8,16}_slice()` + to `from_utf{8,16}_slice_start()` to be consistent with `Utf8Char`. +* Fix a bug where `CharExt::from_slice()` would accept some trailing surrogates + as standalone codepoints. + +Version 0.2.0 (2016-07-24) +========================== +* Change `CharExt::write_utf{8,16}()` to panic instead of returning `None` + if the slice is too short. +* Fix bug where `CharExt::write_utf8()` and `Utf8Char::to_slice()` could change bytes it shouldn't. +* Rename lots of errors with search and replace: + * CodePoint -> Codepoint + * Several -> Multiple +* Update the ascii feature to use [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) v0.7. +* Support `#[no_std]`; see 70e090ee for differences. +* Ungate impls of `AsciiExt`. (doesn't require ascii or nightly) +* Make the tests compile (and pass) again. + (They still require nightly). + +Version 0.1.* (2016-04-07) +========================== +First release. diff --git a/vendor/encode_unicode-0.3.6/benches/multiiterators.rs b/vendor/encode_unicode-0.3.6/benches/multiiterators.rs new file mode 100644 index 00000000..22c3eed1 --- /dev/null +++ b/vendor/encode_unicode-0.3.6/benches/multiiterators.rs @@ -0,0 +1,93 @@ +// uses /usr/share/dict/ for text to convert to Vec<Utf*Char> and iterate over +#![cfg(all(unix, feature="std"))] +#![feature(test)] +extern crate test; +use test::{Bencher, black_box}; +#[macro_use] extern crate lazy_static; +extern crate encode_unicode; +use encode_unicode::{CharExt, Utf8Char, Utf16Char, iter_bytes, iter_units}; + +static ENGLISH: &str = include_str!("/usr/share/dict/american-english"); +// TODO find a big chinese file; `aptitude search '?provides(wordlist)'` didn't have one +lazy_static!{ + static ref UTF8CHARS: Vec<Utf8Char> = ENGLISH.chars().map(|c| c.to_utf8() ).collect(); + static ref UTF16CHARS: Vec<Utf16Char> = ENGLISH.chars().map(|c| c.to_utf16() ).collect(); +} + + +#[bench] +fn utf16_split_all_single_mulititerator(b: &mut Bencher) { + b.iter(|| { + iter_units(black_box(&*UTF16CHARS)).for_each(|u| assert!(u != 0) ); + }); +} +#[bench] +fn utf16_split_all_single_flatmap(b: &mut Bencher) { + b.iter(|| { + black_box(&*UTF16CHARS).iter().flat_map(|&u16c| u16c ).for_each(|u| assert!(u != 0) ); + }); +} +#[bench] +fn utf16_split_all_single_cloned_flatten(b: &mut Bencher) { + b.iter(|| { + black_box(&*UTF16CHARS).iter().cloned().flatten().for_each(|u| assert!(u != 0) ); + }); +} + + +#[bench] +fn utf8_split_mostly_ascii_multiiterator(b: &mut Bencher) { + b.iter(|| { + iter_bytes(black_box(&*UTF8CHARS)).for_each(|b| assert!(b != 0) ); + }); +} +#[bench] +fn utf8_split_mostly_ascii_flatmap(b: &mut Bencher) { + b.iter(|| { + black_box(&*UTF8CHARS).iter().flat_map(|&u8c| u8c ).for_each(|b| assert!(b != 0) ); + }); +} +#[bench] +fn utf8_split_mostly_ascii_cloned_flatten(b: &mut Bencher) { + b.iter(|| { + black_box(&*UTF8CHARS).iter().cloned().flatten().for_each(|b| assert!(b != 0) ); + }); +} + + +#[bench] +fn utf8_extend_mostly_ascii_multiiterator(b: &mut Bencher) { + b.iter(|| { + let vec: Vec<u8> = iter_bytes(black_box(&*UTF8CHARS)).collect(); + assert_eq!(black_box(vec).len(), ENGLISH.len()); + }); +} +#[bench] +fn utf8_extend_mostly_ascii_custom(b: &mut Bencher) { + b.iter(|| { + let vec: Vec<u8> = black_box(&*UTF8CHARS).iter().collect(); + assert_eq!(black_box(vec).len(), ENGLISH.len()); + }); +} +#[bench] +fn utf8_extend_mostly_ascii_custom_str(b: &mut Bencher) { + b.iter(|| { + let vec: String = black_box(&*UTF8CHARS).iter().cloned().collect(); + assert_eq!(black_box(vec).len(), ENGLISH.len()); + }); +} + +#[bench] +fn utf16_extend_all_single_multiiterator(b: &mut Bencher) { + b.iter(|| { + let vec: Vec<u16> = iter_units(black_box(&*UTF16CHARS)).collect(); + assert!(black_box(vec).len() < ENGLISH.len()); + }); +} +#[bench] +fn utf16_extend_all_single_custom(b: &mut Bencher) { + b.iter(|| { + let vec: Vec<u16> = black_box(&*UTF16CHARS).iter().collect(); + assert!(black_box(vec).len() < ENGLISH.len()); + }); +} diff --git a/vendor/encode_unicode-0.3.6/src/decoding_iterators.rs b/vendor/encode_unicode-0.3.6/src/decoding_iterators.rs new file mode 100644 index 00000000..4ef41250 --- /dev/null +++ b/vendor/encode_unicode-0.3.6/src/decoding_iterators.rs @@ -0,0 +1,494 @@ +/* Copyright 2018 The encode_unicode Developers + * + * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or + * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or + * http://opensource.org/licenses/MIT>, at your option. This file may not be + * copied, modified, or distributed except according to those terms. + */ + +//! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail. +//! +//! To be predictable, all errors consume one element each. +//! +//! The iterator adaptors produce neither offset nor element length to work +//! well with other adaptors, +//! while the slice iterators yield both to make more advanced use cases easy. + +use errors::{InvalidUtf8Slice, InvalidUtf16FirstUnit, Utf16PairError}; +use errors::InvalidUtf8Slice::*; +use errors::InvalidUtf8::*; +use errors::InvalidUtf8FirstByte::*; +use errors::InvalidUtf16Slice::*; +use errors::InvalidCodepoint::*; +use errors::Utf16PairError::*; +use utf8_char::Utf8Char; +use utf16_char::Utf16Char; +use traits::U16UtfExt; +extern crate core; +use self::core::borrow::Borrow; +use self::core::fmt::{self, Debug}; +use self::core::iter::Chain; +use self::core::option; + + +/// Decodes UTF-8 characters from a byte iterator into `Utf8Char`s. +/// +/// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars) +/// for examples and error handling. +#[derive(Clone, Default)] +pub struct Utf8CharMerger<B:Borrow<u8>, I:Iterator<Item=B>> { + iter: I, + /// number of bytes that were read before an error was detected + after_err_leftover: u8, + /// stack because it simplifies popping. + after_err_stack: [u8; 3], +} +impl<B:Borrow<u8>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>> +From<T> for Utf8CharMerger<B, I> { + fn from(t: T) -> Self { + Utf8CharMerger { + iter: t.into_iter(), + after_err_leftover: 0, + after_err_stack: [0; 3], + } + } +} +impl<B:Borrow<u8>, I:Iterator<Item=B>> Utf8CharMerger<B,I> { + /// Extract the inner iterator. + /// + /// If the last item produced by `.next()` was an `Err`, + /// up to three following bytes might be missing. + /// The exact number of missing bytes for each error type should not be relied on. + /// + /// # Examples + /// + /// Three bytes swallowed: + /// ``` + /// # use encode_unicode::IterExt; + /// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars(); + /// assert!(merger.next().unwrap().is_err()); + /// let mut inner: std::slice::Iter<u8> = merger.into_inner(); + /// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared + /// ``` + /// + /// All bytes present: + /// ``` + /// # use encode_unicode::IterExt; + /// let mut merger = b"\xb0FS".iter().to_utf8chars(); + /// assert!(merger.next().unwrap().is_err()); + /// assert_eq!(merger.into_inner().next(), Some(&b'F')); + /// ``` + /// + /// Two bytes missing: + /// ``` + /// # use encode_unicode::IterExt; + /// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars(); + /// assert!(merger.next().unwrap().is_err()); + /// assert_eq!(merger.into_inner().next(), Some(&b'F')); + /// ``` + pub fn into_inner(self) -> I { + self.iter + } + + fn save(&mut self, bytes: &[u8;4], len: usize) { + // forget bytes[0] and push the others onto self.after_err_stack (in reverse). + for &after_err in bytes[1..len].iter().rev() { + self.after_err_stack[self.after_err_leftover as usize] = after_err; + self.after_err_leftover += 1; + } + } + /// Reads len-1 bytes into bytes[1..] + fn extra(&mut self, bytes: &mut[u8;4], len: usize) -> Result<(),InvalidUtf8Slice> { + // This is the only function that pushes onto after_err_stack, + // and it checks that all bytes are continuation bytes before fetching the next one. + // Therefore only the last byte retrieved can be a non-continuation byte. + // That last byte is also the last to be retrieved from after_err. + // + // Before this function is called, there has been retrieved at least one byte. + // If that byte was a continuation byte, next() produces an error + // and won't call this function. + // Therefore, we know that after_err is empty at this point. + // This means that we can use self.iter directly, and knows where to start pushing + debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack); + for i in 1..len { + if let Some(extra) = self.iter.next() { + let extra = *extra.borrow(); + bytes[i] = extra; + if extra & 0b1100_0000 != 0b1000_0000 { + // not a continuation byte + self.save(bytes, i+1); + return Err(InvalidUtf8Slice::Utf8(NotAContinuationByte(i))) + } + } else { + self.save(bytes, i); + return Err(TooShort(len)); + } + } + Ok(()) + } +} +impl<B:Borrow<u8>, I:Iterator<Item=B>> Iterator for Utf8CharMerger<B,I> { + type Item = Result<Utf8Char,InvalidUtf8Slice>; + fn next(&mut self) -> Option<Self::Item> { + let first: u8; + if self.after_err_leftover != 0 { + self.after_err_leftover -= 1; + first = self.after_err_stack[self.after_e |