summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCharles Duffy <charles@dyfis.net>2018-10-14 00:40:37 -0500
committerCharles Duffy <charles@dyfis.net>2018-11-29 20:27:44 -0600
commit7df55477fd04b1443051f767fd77ea42cb50ed59 (patch)
treeb7d9d90e9ece1e2af48425e2129bbed53d50866d
parent6845ebbff1c4082c5d4452dcb0e11e61d143e841 (diff)
bees: init at 0.6.1
Introduce an extent-layer (as opposed to the existing file-level) deduplication system for btrfs. This provides a means of finding similarities within non-identical files, when they contain identical, aligned blocks.
-rwxr-xr-xpkgs/tools/filesystems/bees/bees-service-wrapper223
-rw-r--r--pkgs/tools/filesystems/bees/default.nix69
-rw-r--r--pkgs/top-level/all-packages.nix2
3 files changed, 294 insertions, 0 deletions
diff --git a/pkgs/tools/filesystems/bees/bees-service-wrapper b/pkgs/tools/filesystems/bees/bees-service-wrapper
new file mode 100755
index 000000000000..8ef46afc18f5
--- /dev/null
+++ b/pkgs/tools/filesystems/bees/bees-service-wrapper
@@ -0,0 +1,223 @@
+#!@bash@/bin/bash
+PATH=@bash@/bin:@coreutils@/bin:@utillinux@/bin:@btrfsProgs@/bin:$PATH
+beesd_bin=@bees@/lib/bees/bees
+# PLEASE KEEP NIX-ISMS ABOVE THIS LINE TO EASE UPSTREAM MERGE
+#!/usr/bin/env bash
+
+# Extended globs are required for the +([[:space:]]) pattern used when
+# parsing configuration records.
+shopt -s extglob
+
+# Upstream wrapper requires UUID to be used for configuration.
+
+# However, when declaratively describing a host, we may not know its UUID, and
+# shouldn't need to persist something that will differ between hosts built from
+# the same configuration template.
+
+# Thus, for using bees from NixOS, we have our own wrapper, which supports not
+# just UUID but any specification permitted by findmnt.
+
+# When bees_debug is set to a non-empty value, trace every command, prefixing
+# each trace line with the script's base name and the current line number.
+[[ $bees_debug ]] && { PS4=':${BASH_SOURCE##*/}:$LINENO+'; set -x; }
+
+# Print command-line help to stderr and exit with status 1.
+#
+# The here-document delimiter is deliberately unquoted so that the script name
+# and the effective config directory are interpolated. As a consequence, the
+# literal "$((" sequences in the examples must be backslash-escaped; otherwise
+# bash performs the arithmetic and prints the results instead of the examples.
+usage() {
+  cat >&2 <<EOF
+Usage: ${BASH_SOURCE##*/} run|cleanup config-name|fsSpec [idxSizeMB=...] [verbosity=...] [workDir=...] [-- daemon-options...]
+
+ fsSpec should be in a format recognized by findmnt. Alternately,
+ "config-name" may refer to a file that exists in ${bees_config_dir:-/etc/bees}
+ with a .conf extension; if that file does not specify UUID, findmnt will be
+ used in addition.
+
+ Note that while config files may presently use shell arithmetic, use of this
+ functionality is not encouraged going forward: Setting ''idxSizeMB=4096'' is
+ preferred over ''DB_SIZE=\$((1024*1024*1024*4))'' or ''DB_SIZE=\$(( AL16M * 256 ))'',
+ although both of these are presently supported.
+
+ If fsSpec contains a /, it is assumed to be a mount point to be looked up by
+ findmnt, not a config file name.
+
+ daemon-options are passed directly through to the daemon on startup, as
+ documented at https://github.com/Zygo/bees/blob/master/docs/options.md.
+EOF
+  exit 1
+}
+
+# Report a fatal error: write all arguments, joined into one line, to stderr
+# and terminate the script with status 1.
+die() {
+  echo "$*" >&2
+  exit 1
+}
+
+# Canonical option names. set_option stores each one in a global variable named
+# with a "bees_" prefix (e.g. bees_workDir); these names are also among the
+# variables reported back from an evaluated config file.
+allConfigNames=( blockdev fsSpec home idxSize idxSizeMB mntDir runDir status verbosity workDir )
+
+# Alternate names for configuration values, mapped to the canonical name used
+# internally; "bees_" will always be prepended to that canonical name.
+declare -A altConfigNames=(
+ # from original bees wrapper
+ [BEESHOME]=home
+ [BEESSTATUS]=status
+ [MNT_DIR]=mntDir
+ [UUID]=uuid
+ [WORK_DIR]=runDir
+ [DB_SIZE]=idxSize
+)
+
+# Evaluate a legacy config file and emit the settings it defines.
+#
+# Legacy bees config files can be arbitrary shell scripts, so we need to
+# actually evaluate them. That is done in a child bash started in restricted
+# mode (-r) with PATH pointed at /var/empty and ENV/BASH_ENV cleared; AL128K
+# and AL16M are provided for config files that use them in arithmetic.
+#
+# Arguments: $1 - path of the config file (fed to the child shell on stdin)
+# Output:    one NUL-terminated "name=value" record on stdout for every known
+#            variable name (alternate or canonical) that is non-empty after
+#            evaluation; anything the config file itself writes to stdout is
+#            redirected to stderr.
+sandboxedConfigFileEval() {
+  local bash_exe
+  bash_exe=$(type -P bash) || exit
+  # The first word following the -c script is assigned to $0, not to the
+  # positional parameters, so a placeholder ("_") is passed in that slot.
+  # Without it, whichever variable name happened to come first would be
+  # silently skipped by "for var".
+  PATH=/var/empty ENV='' BASH_ENV='' AL128K="$((128*1024))" AL16M="$((16*1024*1024))" "$bash_exe" -r ${bees_debug+-x} \
+    -c 'eval "$(</dev/stdin)" >&2; for var; do [[ ${!var} ]] && printf "%q=%s\\0" "$var" "${!var}"; done' \
+    _ "${!altConfigNames[@]}" "${allConfigNames[@]}" \
+    <"$1"
+}
+
+# Load settings from the config file named by $1, if it exists and is
+# non-empty.
+#
+# The file is evaluated by sandboxedConfigFileEval; each NUL-delimited record
+# it emits is passed to set_option. Returns 1 without reading anything when
+# the file is missing or empty.
+readConfigFileIfExists() {
+  local record
+  if [[ ! -s $1 ]]; then
+    return 1
+  fi
+  while IFS= read -r -d '' record; do
+    # Drop a trailing comment: everything from the earliest "#" that is
+    # preceded by whitespace.
+    record=${record%%+([[:space:]])"#"*}
+    if [[ -z $record ]]; then
+      continue
+    fi
+    if [[ $record != *=* ]]; then
+      printf 'WARNING: Config file line not recognized: %q\n' "$record" >&2
+      continue
+    fi
+    set_option "$record"
+  done < <(sandboxedConfigFileEval "$1")
+}
+
+# Apply one "name=value" setting by assigning the value to the global variable
+# bees_<name>. Alternate (legacy) names listed in altConfigNames are first
+# mapped to their canonical name.
+#
+# Arguments: $1 - a string of the form name=value, split at the first "="
+# Returns:   0 on success; 1, after a warning on stderr, when the name contains
+#            anything other than ASCII letters, digits and underscores.
+set_option() {
+  local k v
+  k="${1%%=*}" v="${1#*=}"
+  # The name becomes part of a printf -v assignment target. printf -v treats
+  # "x[...]" as an array element and evaluates the subscript, and rejects
+  # names such as "--foo" with an obscure error, so only plain identifier
+  # characters are accepted here.
+  if [[ ! $k =~ ^[A-Za-z0-9_]+$ ]]; then
+    printf 'WARNING: Ignoring option with invalid name: %q\n' "$k" >&2
+    return 1
+  fi
+  [[ ${altConfigNames[$k]} ]] && k=${altConfigNames[$k]}
+  printf -v "bees_$k" %s "$v"
+}
+
+uuid_re='^[[:xdigit:]]{8}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{12}$'
+
+# Shared code for setting configuration used by other operations.
+#
+# Takes the config-name or fsSpec as $1. Starting from whatever bees_* globals
+# were already set from key=value pairs on the command line (via set_option),
+# it reads the applicable config files -- whose non-empty values override
+# those -- resolves the filesystem UUID, and fills in defaults for every
+# remaining bees_* setting.
+#
+# Exits the script when several legacy config files match a bare UUID, when the
+# filesystem cannot be identified or is not btrfs, when the block device is
+# missing, or when the index size is not a multiple of 16MB.
+_setup() {
+  declare fstype
+  local -a allConfFiles=( ) matchingConfFiles=( )
+  bees_fsSpec=$1; shift
+
+  # Look for file-based configuration, additional to honoring configuration on the command line
+  bees_config_dir="${bees_config_dir:-/etc/bees}"
+  if [[ $bees_fsSpec =~ $uuid_re ]]; then
+    bees_uuid=$bees_fsSpec
+    # If our spec looks like a bare UUID, and no config file exists in the new
+    # format, fall back to legacy config file search mechanism (grep; ewww).
+    if ! readConfigFileIfExists "$bees_config_dir/UUID=$bees_fsSpec.conf"; then
+      # Legacy approach to finding a config file: Grep for a *.conf file
+      # containing the UUID within its text. Permitting spaces around the "="
+      # appears to be a bug, but is retained for compatibility with the
+      # original upstream script.
+      allConfFiles=( "$bees_config_dir"/*.conf )
+      # Without nullglob, a glob that matches nothing is left behind as the
+      # literal pattern; treat that as "no config files" instead of handing
+      # grep a nonexistent path.
+      [[ -e ${allConfFiles[0]} || -L ${allConfFiles[0]} ]] || allConfFiles=( )
+      if (( ${#allConfFiles[@]} )); then
+        # in read or readarray with -d '', the NUL terminating the empty string is used as delimiter character.
+        # $bees_fsSpec matched uuid_re above, so it holds only hex digits and
+        # dashes and can be embedded in the regex without escaping.
+        readarray -d '' -t matchingConfFiles < <(grep -E -l -Z "^[^#]*UUID[[:space:]]*=[[:space:]]*[\"']?$bees_fsSpec" "${allConfFiles[@]}")
+      else
+        matchingConfFiles=( )
+      fi
+      if (( ${#matchingConfFiles[@]} == 1 )); then
+        # Exactly one configuration file exists in our target directory with a reference to the UUID given.
+        bees_config_file=${matchingConfFiles[0]}
+        readConfigFileIfExists "$bees_config_file"
+        echo "NOTE: Please consider renaming $bees_config_file to $bees_config_dir/UUID=$bees_fsSpec.conf" >&2
+        echo " ...and passing UUID=$bees_fsSpec on startup." >&2
+      elif (( ${#matchingConfFiles[@]} > 1 )); then
+        # The legacy wrapper would silently use the first file and ignore
+        # others, but... no.
+        echo "ERROR: Passed a bare UUID, but multiple configuration files match it:" >&2
+        printf ' - %q\n' "${matchingConfFiles[@]}" >&2
+        die "Unable to continue."
+      fi
+    fi
+  else
+    # For a non-UUID fsSpec that is not a path, look only for a config file
+    # exactly matching its text.
+    #
+    # (Passing a mount point as a fsSpec is only supported with the new
+    # wrapper; all key=value pairs can be passed on the command line in this
+    # mode, so config file support is not needed).
+    [[ $bees_fsSpec = */* ]] || readConfigFileIfExists "$bees_config_dir/$bees_fsSpec.conf"
+  fi
+
+  [[ $bees_uuid ]] || {
+    # if bees_uuid is not in our .conf file, look it up with findmnt
+    read -r bees_uuid fstype < <(findmnt -n -o uuid,fstype "$bees_fsSpec") && [[ $fstype ]] \
+      || die "Unable to find a mounted filesystem matching $bees_fsSpec"
+    [[ $fstype = btrfs ]] || die "Device type is $fstype, not btrfs"
+  }
+
+  [[ $bees_uuid = */* ]] || readConfigFileIfExists "$bees_config_dir/UUID=$bees_uuid.conf"
+
+  # Honor any values read from config files above; otherwise, set defaults.
+  bees_workDir="${bees_workDir:-.beeshome}"
+  bees_runDir="${bees_runDir:-/run/bees}"
+  bees_mntDir="${bees_mntDir:-$bees_runDir/mnt/$bees_uuid}"
+  bees_home="${bees_home:-$bees_mntDir/$bees_workDir}"
+  bees_status="${bees_status:-${bees_runDir}/$bees_uuid.status}"
+  bees_verbosity="${bees_verbosity:-6}"
+  bees_idxSizeMB="${bees_idxSizeMB:-1024}"
+  bees_idxSize=${bees_idxSize:-"$(( bees_idxSizeMB * 1024 * 1024 ))"}
+  bees_blockdev=${bees_blockdev:-"/dev/disk/by-uuid/$bees_uuid"}
+
+  [[ -b $bees_blockdev ]] || die "Block device $bees_blockdev missing"
+  (( bees_idxSize % (16 * 1024 * 1024) == 0 )) || die "DB size must be divisible by 16MB"
+}
+
+# Mount the filesystem's root subvolume, prepare the bees home subvolume and
+# hash table file, then replace this process with the bees daemon.
+#
+# Arguments: $1 - config-name or fsSpec, handed to _setup; any remaining
+#            arguments are passed to the daemon ahead of the mount point.
+# Exits with a non-zero status as soon as any preparation step fails.
+do_run() {
+  local db old_db_size new_db_size
+
+  _setup "$1"; shift
+  mkdir -p -- "$bees_mntDir" || exit
+
+  # subvol id 5 is reserved for the root subvolume of a btrfs filesystem.
+  mountpoint -q "$bees_mntDir" || mount -osubvolid=5 -- "$bees_blockdev" "$bees_mntDir" || exit
+  if [[ -d $bees_home ]]; then
+    btrfs subvolume show "$bees_home" >/dev/null 2>&1 || die "$bees_home exists but is not a subvolume"
+  else
+    btrfs subvolume create "$bees_home" || exit
+    sync # workaround for Zygo/bees#93
+  fi
+  db=$bees_home/beeshash.dat
+  # Make sure the hash table file exists so its current size can be queried.
+  touch -- "$db" || exit
+
+  old_db_size=$(stat -c %s -- "$db") || exit
+  new_db_size=$bees_idxSize
+
+  if (( old_db_size != new_db_size )); then
+    # Resizing the hash table also discards the saved crawl state file
+    # (presumably it is not valid against a resized table -- TODO confirm).
+    rm -f -- "$bees_home"/beescrawl."$bees_uuid".dat
+    truncate -s "$new_db_size" -- "$db" || exit
+  fi
+  chmod 700 -- "$bees_home"
+
+  # BEESSTATUS and BEESHOME are the only variables handled by the legacy
+  # wrapper for which getenv() is called in C code.
+  BEESSTATUS=$bees_status BEESHOME=$bees_home exec "${beesd_bin:-/lib/bees/bees}" \
+    --verbose "$bees_verbosity" \
+    "$@" "$bees_mntDir" || exit
+}
+
+# Lazily unmount the bees mount point for the filesystem named by $1.
+#
+# Exits with the failing command's status when the directory is not a mount
+# point or when the unmount itself fails.
+do_cleanup() {
+  _setup "$1"; shift
+  if mountpoint -q "$bees_mntDir"; then
+    umount -l -- "$bees_mntDir" || exit
+  else
+    # bare "exit" propagates the status of the mountpoint check above
+    exit
+  fi
+}
+
+# Entry point: at least a mode and a config-name/fsSpec are required, and the
+# mode must correspond to a do_* function (currently "run" or "cleanup").
+if (( $# < 2 )); then
+  usage
+fi
+if ! declare -f "do_$1" >/dev/null 2>&1; then
+  usage
+fi
+mode=$1; shift
+
+# The first remaining argument (config-name|fsSpec) is passed through literally.
+declare -a args=( "$1" ); shift
+
+# Everything else is either a key=value option (applied immediately), the "--"
+# separator (after which all remaining arguments are passed through
+# untouched), or a plain argument that is passed through as-is.
+while (( $# )); do
+  case $1 in
+    *=*)
+      set_option "$1"
+      ;;
+    --)
+      shift
+      args+=( "$@" )
+      break
+      ;;
+    *)
+      args+=( "$1" )
+      ;;
+  esac
+  shift
+done
+
+"do_$mode" "${args[@]}"
diff --git a/pkgs/tools/filesystems/bees/default.nix b/pkgs/tools/filesystems/bees/default.nix
new file mode 100644
index 000000000000..c43962cb075d
--- /dev/null
+++ b/pkgs/tools/filesystems/bees/default.nix
@@ -0,0 +1,69 @@
+{ stdenv, runCommand, makeWrapper, fetchFromGitHub, bash, btrfs-progs, coreutils, pythonPackages, utillinux }:
+
+let
+
+ version = "0.6.1";
+ sha256 = "0h7idclmhyp14mq6786x7f2237vqpn70gyi88ik4g70xl84yfgyh";
+
+ # The bees daemon itself, built from the upstream GitHub tag v${version}.
+ bees = stdenv.mkDerivation rec {
+ name = "bees-${version}";
+ inherit version;
+
+ src = fetchFromGitHub {
+ owner = "Zygo";
+ repo = "bees";
+ rev = "v${version}";
+ inherit sha256;
+ };
+
+ buildInputs = [
+ btrfs-progs # for btrfs/ioctl.h
+ utillinux # for uuid.h
+ ];
+
+ nativeBuildInputs = [
+ pythonPackages.markdown # documentation build
+ ];
+
+ # Shadow git with an exported shell function that answers "git describe"
+ # with our version and defers every other invocation to the real git.
+ # Presumably the upstream build derives its version string from
+ # "git describe", which cannot work on a fetched tarball -- TODO confirm.
+ preBuild = ''
+ git() { if [[ $1 = describe ]]; then echo ${version}; else command git "$@"; fi; }
+ export -f git
+ '';
+
+ # Remove the git shim again once the build phase has finished.
+ postBuild = ''
+ unset -f git
+ '';
+
+ # NOTE(review): ETC_PREFIX is given both here and in makeFlags with
+ # different values; presumably this one is the path baked in at build time
+ # while the makeFlags value controls where files get installed -- confirm.
+ buildFlags = [
+ "ETC_PREFIX=/var/run/bees/configs"
+ ];
+
+ makeFlags = [
+ "SHELL=bash"
+ "PREFIX=$(out)"
+ "ETC_PREFIX=$(out)/etc"
+ "BEES_VERSION=${version}"
+ "SYSTEMD_SYSTEM_UNIT_DIR=$(out)/etc/systemd/system"
+ ];
+
+ meta = with stdenv.lib; {
+ homepage = "https://github.com/Zygo/bees";
+ description = "Block-oriented BTRFS deduplication service";
+ license = licenses.gpl3;
+ platforms = platforms.linux;
+ maintainers = with maintainers; [ chaduffy ];
+ longDescription = "Best-Effort Extent-Same: bees finds not just identical files, but also identical extents within files that differ";
+ };
+ };
+
+in
+
+# The package actually exposed: the NixOS-specific service wrapper plus a link
+# to the upstream beesd script. The attributes below are made available to the
+# build script as variables, and substituteAll fills the wrapper's matching
+# @bash@, @bees@, @coreutils@, @utillinux@ and @btrfsProgs@ placeholders.
+runCommand "bees-service-${version}" {
+ inherit bash bees coreutils utillinux;
+ btrfsProgs = btrfs-progs; # needs to be a valid shell variable name
+} ''
+ mkdir -p -- "$out/bin"
+ substituteAll ${./bees-service-wrapper} "$out"/bin/bees-service-wrapper
+ chmod +x "$out"/bin/bees-service-wrapper
+ ln -s ${bees}/bin/beesd "$out"/bin/beesd
+''
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 09f566ccd7fd..c7fe0a978c66 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -21893,6 +21893,8 @@ with pkgs;
beep = callPackage ../misc/beep { };
+ bees = callPackage ../tools/filesystems/bees { };
+
blackbird = callPackage ../misc/themes/blackbird { };
bootil = callPackage ../development/libraries/bootil { };