diff options
author | Martin Nordholts <enselic@gmail.com> | 2021-09-09 20:52:33 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-09-09 20:52:33 +0200 |
commit | 9124271eaf237519f9381b78681d71113e308a58 (patch) | |
tree | edfff3277762b04745d41a2aa6371e8aa51ba2bf /src/assets.rs | |
parent | 156dec2737db4759f41a30650cfe938de3c65d5c (diff) |
Load independent and minimal syntax sets when using --language (#1787)
This significantly speeds up the startup time of bat, since only a single
linked SyntaxDefinition is loaded for each file. The size increase of the
binary is just ~400 kB.
For startup time to improve, the --language argument must be used, and its
value must match one of the following names:
"Plain Text", "ActionScript", "AppleScript", "Batch File", "NAnt Build File",
"C#", "C", "CSS", "D", "Diff", "Erlang", "Go", "Haskell", "JSON", "Java
Properties", "BibTeX", "LaTeX Log", "TeX", "Lisp", "Lua", "MATLAB", "Pascal",
"R", "Regular Expression", "Rust", "SQL", "Scala", "Tcl", "XML", "YAML", "Apache
Conf", "ARM Assembly", "Assembly (x86_64)", "CMakeCache", "Comma Separated
Values", "Cabal", "CoffeeScript", "CpuInfo", "Dart Analysis Output", "Dart",
"Dockerfile", "DotENV", "F#", "Friendly Interactive Shell (fish)", "Fortran
(Fixed Form)", "Fortran (Modern)", "Fortran Namelist", "fstab", "GLSL",
"GraphQL", "Groff/troff", "group", "hosts", "INI", "Jinja2", "jsonnet",
"Kotlin", "Less", "LLVM", "Lean", "MemInfo", "Nim", "Ninja", "Nix", "passwd",
"PowerShell", "Protocol Buffer (TEXT)", "Puppet", "Rego", "resolv", "Robot
Framework", "SML", "Strace", "Stylus", "Solidity", "Vyper", "Swift",
"SystemVerilog", "TOML", "Terraform", "TypeScript", "TypeScriptReact",
"Verilog", "VimL", "Zig", "gnuplot", "log", "requirements.txt", "Highlight
non-printables", "Private Key", "varlink"
Later commits will improve startup time for more code paths.
* fix some typos and misspellings
* CHANGELOG.md: Add Performance section (preliminary)
* Add a CHANGELOG.md entry for this PR
Diffstat (limited to 'src/assets.rs')
-rw-r--r-- | src/assets.rs | 98 |
1 file changed, 96 insertions, 2 deletions
diff --git a/src/assets.rs b/src/assets.rs index 26de12e7..e315a402 100644 --- a/src/assets.rs +++ b/src/assets.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::ffi::OsStr; use std::fs; use std::path::{Path, PathBuf}; @@ -18,6 +19,14 @@ use crate::syntax_mapping::{MappingTarget, SyntaxMapping}; pub struct HighlightingAssets { syntax_set_cell: LazyCell<SyntaxSet>, serialized_syntax_set: SerializedSyntaxSet, + + minimal_syntaxes: MinimalSyntaxes, + + /// Lazily load serialized [SyntaxSet]s from [Self.minimal_syntaxes]. The + /// index in this vec matches the index in + /// [Self.minimal_syntaxes.serialized_syntax_sets] + deserialized_minimal_syntaxes: Vec<LazyCell<SyntaxSet>>, + theme_set: ThemeSet, fallback_theme: Option<&'static str>, } @@ -28,12 +37,39 @@ pub struct SyntaxReferenceInSet<'a> { pub syntax_set: &'a SyntaxSet, } +/// Stores and allows lookup of minimal [SyntaxSet]s. The [SyntaxSet]s are +/// stored in serialized form, and are deserialized on-demand. This gives good +/// startup performance since only the necessary [SyntaxReference]s needs to be +/// deserialized. +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub(crate) struct MinimalSyntaxes { + /// Lookup the index into `serialized_syntax_sets` of a [SyntaxSet] by the + /// name of any [SyntaxReference] inside the [SyntaxSet] + /// (We will later add `by_extension`, `by_first_line`, etc.) + pub(crate) by_name: HashMap<String, usize>, + + /// Serialized [SyntaxSet]s. 
Whether or not this data is compressed is + /// decided by [COMPRESS_SERIALIZED_MINIMAL_SYNTAXES] + pub(crate) serialized_syntax_sets: Vec<Vec<u8>>, +} + // Compress for size of ~700 kB instead of ~4600 kB at the cost of ~30% longer deserialization time pub(crate) const COMPRESS_SYNTAXES: bool = true; // Compress for size of ~20 kB instead of ~200 kB at the cost of ~30% longer deserialization time pub(crate) const COMPRESS_THEMES: bool = true; +// Compress for size of ~400 kB instead of ~2100 kB at the cost of ~30% longer deserialization time +pub(crate) const COMPRESS_SERIALIZED_MINIMAL_SYNTAXES: bool = true; + +// Whether or not to compress the serialized form of [MinimalSyntaxes]. Shall +// always be `false`, because the data in +// [MinimalSyntaxes.serialized_syntax_sets] has already been compressed +// (assuming [COMPRESS_SERIALIZED_MINIMAL_SYNTAXES] is `true`). The "outer" data +// structures like `by_name` are tiny. If we compress, deserialization can't do +// efficient byte-by-byte copy of `serialized_syntax_sets`. 
+pub(crate) const COMPRESS_MINIMAL_SYNTAXES: bool = false; + const IGNORED_SUFFIXES: [&str; 13] = [ // Editor etc backups "~", @@ -55,10 +91,20 @@ const IGNORED_SUFFIXES: [&str; 13] = [ ]; impl HighlightingAssets { - fn new(serialized_syntax_set: SerializedSyntaxSet, theme_set: ThemeSet) -> Self { + fn new( + serialized_syntax_set: SerializedSyntaxSet, + minimal_syntaxes: MinimalSyntaxes, + theme_set: ThemeSet, + ) -> Self { + // Prepare so we can lazily load minimal syntaxes without a mut reference + let deserialized_minimal_syntaxes = + vec![LazyCell::new(); minimal_syntaxes.serialized_syntax_sets.len()]; + HighlightingAssets { syntax_set_cell: LazyCell::new(), serialized_syntax_set, + deserialized_minimal_syntaxes, + minimal_syntaxes, theme_set, fallback_theme: None, } @@ -71,6 +117,11 @@ impl HighlightingAssets { pub fn from_cache(cache_path: &Path) -> Result<Self> { Ok(HighlightingAssets::new( SerializedSyntaxSet::FromFile(cache_path.join("syntaxes.bin")), + asset_from_cache( + &cache_path.join("minimal_syntaxes.bin"), + "minimal syntax sets", + COMPRESS_MINIMAL_SYNTAXES, + )?, asset_from_cache(&cache_path.join("themes.bin"), "theme set", COMPRESS_THEMES)?, )) } @@ -78,6 +129,7 @@ impl HighlightingAssets { pub fn from_binary() -> Self { HighlightingAssets::new( SerializedSyntaxSet::FromBinary(get_serialized_integrated_syntaxset()), + get_integrated_minimal_syntaxes(), get_integrated_themeset(), ) } @@ -111,6 +163,41 @@ impl HighlightingAssets { self.get_theme_set().themes.keys().map(|s| s.as_ref()) } + /// Finds a [SyntaxSet] that contains a [SyntaxReference] by its name. First + /// tries to find a minimal [SyntaxSet]. If none is found, returns the + /// [SyntaxSet] that contains all syntaxes. 
+ fn get_syntax_set_by_name(&self, name: &str) -> Result<&SyntaxSet> { + let minimal_syntax_set = self + .minimal_syntaxes + .by_name + .get(&name.to_ascii_lowercase()) + .and_then(|index| self.get_minimal_syntax_set_with_index(*index)); + + match minimal_syntax_set { + Some(syntax_set) => Ok(syntax_set), + None => self.get_syntax_set(), + } + } + + fn load_minimal_syntax_set_with_index(&self, index: usize) -> Result<SyntaxSet> { + let serialized_syntax_set = &self.minimal_syntaxes.serialized_syntax_sets[index]; + asset_from_contents( + &serialized_syntax_set[..], + &format!("minimal syntax set {}", index), + COMPRESS_SERIALIZED_MINIMAL_SYNTAXES, + ) + .map_err(|_| format!("Could not parse minimal syntax set {}", index).into()) + } + + fn get_minimal_syntax_set_with_index(&self, index: usize) -> Option<&SyntaxSet> { + self.deserialized_minimal_syntaxes + .get(index) + .and_then(|cell| { + cell.try_borrow_with(|| self.load_minimal_syntax_set_with_index(index)) + .ok() + }) + } + /// Use [Self::get_syntax_for_file_name] instead #[deprecated] pub fn syntax_for_file_name( @@ -167,7 +254,7 @@ impl HighlightingAssets { mapping: &SyntaxMapping, ) -> Result<SyntaxReferenceInSet> { if let Some(language) = language { - let syntax_set = self.get_syntax_set()?; + let syntax_set = self.get_syntax_set_by_name(language)?; syntax_set .find_syntax_by_token(language) .map(|syntax| SyntaxReferenceInSet { syntax, syntax_set }) @@ -320,6 +407,13 @@ pub(crate) fn get_integrated_themeset() -> ThemeSet { from_binary(include_bytes!("../assets/themes.bin"), COMPRESS_THEMES) } +fn get_integrated_minimal_syntaxes() -> MinimalSyntaxes { + from_binary( + include_bytes!("../assets/minimal_syntaxes.bin"), + COMPRESS_MINIMAL_SYNTAXES, + ) +} + pub(crate) fn from_binary<T: serde::de::DeserializeOwned>(v: &[u8], compressed: bool) -> T { asset_from_contents(v, "n/a", compressed) .expect("data integrated in binary is never faulty, but make sure `compressed` is in sync!") |