summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Thomas Waldmann <tw@waldmann-edv.de>, 2024-11-15 10:42:07 +0100
committer: Thomas Waldmann <tw@waldmann-edv.de>, 2024-11-15 10:42:07 +0100
commit: f7dea6e93fb144cc28e56657df4a3906476d7c8d (patch)
tree: 629332b4cb0755ecb325e68215da923936ce8fab
parent: b6ae924f30d7e7320b03681974f4662f9afd65df (diff)
repo-compress: reduce memory consumption
The worst (but frequent) case here is that all or most of the chunks in the repo need to get recompressed, so storing all chunk ids in a Python list would need a significant amount of memory for large repositories. We already have all chunk ids stored in cache.chunks, so we now just flag the ones needing recompression by setting the F_COMPRESS flag (which does not need any additional memory).
-rw-r--r--src/borg/archiver/repo_compress_cmd.py30
-rw-r--r--src/borg/hashindex.pyi1
-rw-r--r--src/borg/hashindex.pyx1
3 files changed, 19 insertions, 13 deletions
diff --git a/src/borg/archiver/repo_compress_cmd.py b/src/borg/archiver/repo_compress_cmd.py
index b1f07a6cf..f67e1b045 100644
--- a/src/borg/archiver/repo_compress_cmd.py
+++ b/src/borg/archiver/repo_compress_cmd.py
@@ -4,6 +4,7 @@ from collections import defaultdict
from ._common import with_repository, Highlander
from ..constants import * # NOQA
from ..compress import CompressionSpec, ObfuscateSize, Auto, COMPRESSOR_TABLE
+from ..hashindex import ChunkIndex
from ..helpers import sig_int, ProgressIndicatorPercent, Error
from ..repository import Repository
from ..remote import RemoteRepository
@@ -15,20 +16,22 @@ logger = create_logger()
def find_chunks(repository, repo_objs, cache, stats, ctype, clevel, olevel):
- """find chunks that need processing (usually: recompression)."""
- recompress_ids = []
+ """find and flag chunks that need processing (usually: recompression)."""
compr_keys = stats["compr_keys"] = set()
compr_wanted = ctype, clevel, olevel
- for id, _ in cache.chunks.iteritems():
+ recompress_count = 0
+ for id, cie in cache.chunks.iteritems():
chunk_no_data = repository.get(id, read_data=False)
meta = repo_objs.parse_meta(id, chunk_no_data, ro_type=ROBJ_DONTCARE)
compr_found = meta["ctype"], meta["clevel"], meta.get("olevel", -1)
if compr_found != compr_wanted:
- recompress_ids.append(id)
+ flags_compress = cie.flags | ChunkIndex.F_COMPRESS
+ cache.chunks[id] = cie._replace(flags=flags_compress)
+ recompress_count += 1
compr_keys.add(compr_found)
stats[compr_found] += 1
stats["checked_count"] += 1
- return recompress_ids
+ return recompress_count
def process_chunks(repository, repo_objs, stats, recompress_ids, olevel):
@@ -104,19 +107,20 @@ class RepoCompressMixIn:
stats_find = defaultdict(int)
stats_process = defaultdict(int)
- recompress_ids = find_chunks(repository, repo_objs, cache, stats_find, ctype, clevel, olevel)
- recompress_candidate_count = len(recompress_ids)
- chunks_limit = min(1000, max(100, recompress_candidate_count // 1000))
+ recompress_candidate_count = find_chunks(repository, repo_objs, cache, stats_find, ctype, clevel, olevel)
pi = ProgressIndicatorPercent(
- total=len(recompress_ids), msg="Recompressing %3.1f%%", step=0.1, msgid="repo_compress.process_chunks"
+ total=recompress_candidate_count,
+ msg="Recompressing %3.1f%%",
+ step=0.1,
+ msgid="repo_compress.process_chunks",
)
- while recompress_ids:
+ for id, cie in cache.chunks.iteritems():
if sig_int and sig_int.action_done():
break
- ids, recompress_ids = recompress_ids[:chunks_limit], recompress_ids[chunks_limit:]
- process_chunks(repository, repo_objs, stats_process, ids, olevel)
- pi.show(increase=len(ids))
+ if cie.flags & ChunkIndex.F_COMPRESS:
+ process_chunks(repository, repo_objs, stats_process, [id], olevel)
+ pi.show()
pi.finish()
if sig_int:
# Ctrl-C / SIGINT: do not commit
diff --git a/src/borg/hashindex.pyi b/src/borg/hashindex.pyi
index 994e54b5a..fb05aba86 100644
--- a/src/borg/hashindex.pyi
+++ b/src/borg/hashindex.pyi
@@ -13,6 +13,7 @@ CIE = Union[Tuple[int, int], Type[ChunkIndexEntry]]
class ChunkIndex:
F_NONE: int
F_USED: int
+ F_COMPRESS: int
F_NEW: int
M_USER: int
M_SYSTEM: int
diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx
index beca0540b..75b3b4a85 100644
--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@@ -47,6 +47,7 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
M_SYSTEM = 0xff000000 # mask for system flags
# user flags:
F_USED = 2 ** 0 # chunk is used/referenced
+ F_COMPRESS = 2 ** 1 # chunk shall get (re-)compressed
# system flags (internal use, always 0 to user, not changeable by user):
F_NEW = 2 ** 24 # a new chunk that is not present in repo/cache/chunks.* yet.