summaryrefslogtreecommitdiffstats
path: root/src/assets.rs
diff options
context:
space:
mode:
author Martin Nordholts <enselic@gmail.com> 2021-09-09 20:52:33 +0200
committer GitHub <noreply@github.com> 2021-09-09 20:52:33 +0200
commit 9124271eaf237519f9381b78681d71113e308a58 (patch)
tree edfff3277762b04745d41a2aa6371e8aa51ba2bf /src/assets.rs
parent 156dec2737db4759f41a30650cfe938de3c65d5c (diff)
Load independent and minimal syntax sets when using --language (#1787)
This significantly speeds up the startup time of bat, since only a single linked SyntaxDefinition is loaded for each file. The size increase of the binary is just ~400 kB.

In order for startup time to be improved, the --language arg must be used, and it must match one of the following names:

"Plain Text", "ActionScript", "AppleScript", "Batch File", "NAnt Build File", "C#", "C", "CSS", "D", "Diff", "Erlang", "Go", "Haskell", "JSON", "Java Properties", "BibTeX", "LaTeX Log", "TeX", "Lisp", "Lua", "MATLAB", "Pascal", "R", "Regular Expression", "Rust", "SQL", "Scala", "Tcl", "XML", "YAML", "Apache Conf", "ARM Assembly", "Assembly (x86_64)", "CMakeCache", "Comma Separated Values", "Cabal", "CoffeeScript", "CpuInfo", "Dart Analysis Output", "Dart", "Dockerfile", "DotENV", "F#", "Friendly Interactive Shell (fish)", "Fortran (Fixed Form)", "Fortran (Modern)", "Fortran Namelist", "fstab", "GLSL", "GraphQL", "Groff/troff", "group", "hosts", "INI", "Jinja2", "jsonnet", "Kotlin", "Less", "LLVM", "Lean", "MemInfo", "Nim", "Ninja", "Nix", "passwd", "PowerShell", "Protocol Buffer (TEXT)", "Puppet", "Rego", "resolv", "Robot Framework", "SML", "Strace", "Stylus", "Solidity", "Vyper", "Swift", "SystemVerilog", "TOML", "Terraform", "TypeScript", "TypeScriptReact", "Verilog", "VimL", "Zig", "gnuplot", "log", "requirements.txt", "Highlight non-printables", "Private Key", "varlink"

Later commits will improve startup time for more code paths.

* fix some typos and misspellings
* CHANGELOG.md: Add Performance section (preliminary)
* Add a CHANGELOG.md entry for this PR
Diffstat (limited to 'src/assets.rs')
-rw-r--r-- src/assets.rs 98
1 files changed, 96 insertions, 2 deletions
diff --git a/src/assets.rs b/src/assets.rs
index 26de12e7..e315a402 100644
--- a/src/assets.rs
+++ b/src/assets.rs
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
use std::ffi::OsStr;
use std::fs;
use std::path::{Path, PathBuf};
@@ -18,6 +19,14 @@ use crate::syntax_mapping::{MappingTarget, SyntaxMapping};
pub struct HighlightingAssets {
syntax_set_cell: LazyCell<SyntaxSet>,
serialized_syntax_set: SerializedSyntaxSet,
+
+ /// Minimal [SyntaxSet]s in serialized form, together with the `by_name`
+ /// table used to find the one that contains a given syntax name.
+ minimal_syntaxes: MinimalSyntaxes,
+
+ /// Lazily deserialized [SyntaxSet]s from `minimal_syntaxes`. There is one
+ /// cell per entry of `minimal_syntaxes.serialized_syntax_sets`, and the
+ /// index in this vec matches the index in that vec. Every cell starts out
+ /// empty and is only filled when the corresponding set is requested.
+ deserialized_minimal_syntaxes: Vec<LazyCell<SyntaxSet>>,
+
theme_set: ThemeSet,
fallback_theme: Option<&'static str>,
}
@@ -28,12 +37,39 @@ pub struct SyntaxReferenceInSet<'a> {
pub syntax_set: &'a SyntaxSet,
}
+/// Stores and allows lookup of minimal [SyntaxSet]s. The [SyntaxSet]s are
+/// stored in serialized form, and are deserialized on-demand. This gives good
+/// startup performance since only the necessary [SyntaxReference]s need to be
+/// deserialized.
+#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
+pub(crate) struct MinimalSyntaxes {
+ /// Lookup the index into `serialized_syntax_sets` of a [SyntaxSet] by the
+ /// name of any [SyntaxReference] inside the [SyntaxSet]. Lookups are done
+ /// with the ASCII-lowercased name, so keys are presumably stored
+ /// lowercased (TODO confirm against the code that builds this map).
+ /// (We will later add `by_extension`, `by_first_line`, etc.)
+ pub(crate) by_name: HashMap<String, usize>,
+
+ /// Serialized [SyntaxSet]s, one byte buffer per minimal set. Whether or
+ /// not this data is compressed is decided by
+ /// [COMPRESS_SERIALIZED_MINIMAL_SYNTAXES]
+ pub(crate) serialized_syntax_sets: Vec<Vec<u8>>,
+}
+
// Compress for size of ~700 kB instead of ~4600 kB at the cost of ~30% longer deserialization time
pub(crate) const COMPRESS_SYNTAXES: bool = true;
// Compress for size of ~20 kB instead of ~200 kB at the cost of ~30% longer deserialization time
pub(crate) const COMPRESS_THEMES: bool = true;
+// Applies to each entry of [MinimalSyntaxes.serialized_syntax_sets]: compress for a total size
+// of ~400 kB instead of ~2100 kB at the cost of ~30% longer deserialization time
+pub(crate) const COMPRESS_SERIALIZED_MINIMAL_SYNTAXES: bool = true;
+
+// Whether or not to compress the serialized form of [MinimalSyntaxes] as a
+// whole. Shall always be `false`, because the data in
+// [MinimalSyntaxes.serialized_syntax_sets] has already been compressed
+// (assuming [COMPRESS_SERIALIZED_MINIMAL_SYNTAXES] is `true`). The "outer" data
+// structures like `by_name` are tiny. If we compress, deserialization can't do
+// efficient byte-by-byte copy of `serialized_syntax_sets`.
+pub(crate) const COMPRESS_MINIMAL_SYNTAXES: bool = false;
+
const IGNORED_SUFFIXES: [&str; 13] = [
// Editor etc backups
"~",
@@ -55,10 +91,20 @@ const IGNORED_SUFFIXES: [&str; 13] = [
];
impl HighlightingAssets {
- fn new(serialized_syntax_set: SerializedSyntaxSet, theme_set: ThemeSet) -> Self {
+ fn new(
+ serialized_syntax_set: SerializedSyntaxSet,
+ minimal_syntaxes: MinimalSyntaxes,
+ theme_set: ThemeSet,
+ ) -> Self {
+ // Prepare so we can lazily load minimal syntaxes without a mut reference
+ let deserialized_minimal_syntaxes =
+ vec![LazyCell::new(); minimal_syntaxes.serialized_syntax_sets.len()];
+
HighlightingAssets {
syntax_set_cell: LazyCell::new(),
serialized_syntax_set,
+ deserialized_minimal_syntaxes,
+ minimal_syntaxes,
theme_set,
fallback_theme: None,
}
@@ -71,6 +117,11 @@ impl HighlightingAssets {
pub fn from_cache(cache_path: &Path) -> Result<Self> {
Ok(HighlightingAssets::new(
SerializedSyntaxSet::FromFile(cache_path.join("syntaxes.bin")),
+ // The minimal syntax sets are expected next to `syntaxes.bin` in the
+ // cache directory; an error from `asset_from_cache` is propagated
+ // to the caller with `?`.
+ asset_from_cache(
+ &cache_path.join("minimal_syntaxes.bin"),
+ "minimal syntax sets",
+ COMPRESS_MINIMAL_SYNTAXES,
+ )?,
asset_from_cache(&cache_path.join("themes.bin"), "theme set", COMPRESS_THEMES)?,
))
}
@@ -78,6 +129,7 @@ impl HighlightingAssets {
pub fn from_binary() -> Self {
HighlightingAssets::new(
SerializedSyntaxSet::FromBinary(get_serialized_integrated_syntaxset()),
+ // Minimal syntax sets embedded in the binary via `include_bytes!`
+ get_integrated_minimal_syntaxes(),
get_integrated_themeset(),
)
}
@@ -111,6 +163,41 @@ impl HighlightingAssets {
self.get_theme_set().themes.keys().map(|s| s.as_ref())
}
+ /// Finds a [SyntaxSet] that contains a [SyntaxReference] by its name. First
+ /// tries to find a minimal [SyntaxSet], using the ASCII-lowercased `name`
+ /// as key into `minimal_syntaxes.by_name`. If no entry is found, or if the
+ /// minimal [SyntaxSet] can not be loaded, returns the [SyntaxSet] that
+ /// contains all syntaxes (which may itself fail, hence the `Result`).
+ fn get_syntax_set_by_name(&self, name: &str) -> Result<&SyntaxSet> {
+ let minimal_syntax_set = self
+ .minimal_syntaxes
+ .by_name
+ .get(&name.to_ascii_lowercase())
+ .and_then(|index| self.get_minimal_syntax_set_with_index(*index));
+
+ match minimal_syntax_set {
+ Some(syntax_set) => Ok(syntax_set),
+ // Fall back to the complete syntax set
+ None => self.get_syntax_set(),
+ }
+ }
+
+ /// Deserializes the minimal [SyntaxSet] stored at `index` in
+ /// `minimal_syntaxes.serialized_syntax_sets`.
+ ///
+ /// Panics if `index` is out of bounds, so callers must validate the index
+ /// first. On deserialization failure the underlying error is discarded
+ /// and replaced by a generic "Could not parse" message.
+ fn load_minimal_syntax_set_with_index(&self, index: usize) -> Result<SyntaxSet> {
+ let serialized_syntax_set = &self.minimal_syntaxes.serialized_syntax_sets[index];
+ asset_from_contents(
+ &serialized_syntax_set[..],
+ &format!("minimal syntax set {}", index),
+ COMPRESS_SERIALIZED_MINIMAL_SYNTAXES,
+ )
+ .map_err(|_| format!("Could not parse minimal syntax set {}", index).into())
+ }
+
+ /// Returns the minimal [SyntaxSet] with the given `index`, deserializing it
+ /// through its [LazyCell] if needed.
+ ///
+ /// Returns `None` if `index` is out of range for
+ /// `deserialized_minimal_syntaxes`, or if loading the set fails (the error
+ /// is dropped by `.ok()`).
+ fn get_minimal_syntax_set_with_index(&self, index: usize) -> Option<&SyntaxSet> {
+ self.deserialized_minimal_syntaxes
+ .get(index)
+ .and_then(|cell| {
+ // NOTE(review): presumably `try_borrow_with` only runs the closure
+ // while the cell is still empty and leaves it empty on `Err` -
+ // confirm against the lazycell documentation.
+ cell.try_borrow_with(|| self.load_minimal_syntax_set_with_index(index))
+ .ok()
+ })
+ }
+
/// Use [Self::get_syntax_for_file_name] instead
#[deprecated]
pub fn syntax_for_file_name(
@@ -167,7 +254,7 @@ impl HighlightingAssets {
mapping: &SyntaxMapping,
) -> Result<SyntaxReferenceInSet> {
if let Some(language) = language {
- let syntax_set = self.get_syntax_set()?;
+ let syntax_set = self.get_syntax_set_by_name(language)?;
syntax_set
.find_syntax_by_token(language)
.map(|syntax| SyntaxReferenceInSet { syntax, syntax_set })
@@ -320,6 +407,13 @@ pub(crate) fn get_integrated_themeset() -> ThemeSet {
from_binary(include_bytes!("../assets/themes.bin"), COMPRESS_THEMES)
}
+/// Deserializes the [MinimalSyntaxes] that are embedded in the binary from
+/// `assets/minimal_syntaxes.bin`. Whether the embedded data is treated as
+/// compressed is decided by [COMPRESS_MINIMAL_SYNTAXES]. Panics inside
+/// `from_binary` if the embedded data can not be deserialized.
+fn get_integrated_minimal_syntaxes() -> MinimalSyntaxes {
+ from_binary(
+ include_bytes!("../assets/minimal_syntaxes.bin"),
+ COMPRESS_MINIMAL_SYNTAXES,
+ )
+}
+
pub(crate) fn from_binary<T: serde::de::DeserializeOwned>(v: &[u8], compressed: bool) -> T {
asset_from_contents(v, "n/a", compressed)
.expect("data integrated in binary is never faulty, but make sure `compressed` is in sync!")