fix(ignore): ensure normalization of patterns and paths match (fixes #9597) (#9717)

In ignores, normalize the input when parsing it. When scanning, normalize earlier such that the path is already normalized when checking ignores. This requires splitting normalization of the string from normalization of the file, as we don't want to attempt the latter if the file is ignored. Closes #9598 --------- Co-authored-by: Jakob Borg <jakob@kastelo.net>
author: Simon Frei <freisim93@gmail.com> 2024-09-28 17:16:44 +0200
committer: GitHub <noreply@github.com> 2024-09-28 17:16:44 +0200
commit: 605fd6d726ec20c9e70f3a63e9e31995339f9a01 (patch)
tree: 56f4f2316dd30383f3bd9d30d5be5988eb457842
parent: 3c476542d26bce7ff7a454dceed9173a4aec40b6 (diff)
3 files changed, 59 insertions, 35 deletions
diff --git a/lib/ignore/ignore.go b/lib/ignore/ignore.go
index f0510d8254..c564a0a5f9 100644
--- a/lib/ignore/ignore.go
+++ b/lib/ignore/ignore.go
@@ -18,7 +18,9 @@ import (
 	"time"
 
 	"github.com/gobwas/glob"
+	"golang.org/x/text/unicode/norm"
 
+	"github.com/syncthing/syncthing/lib/build"
 	"github.com/syncthing/syncthing/lib/fs"
 	"github.com/syncthing/syncthing/lib/ignore/ignoreresult"
 	"github.com/syncthing/syncthing/lib/osutil"
@@ -212,6 +214,11 @@ func (m *Matcher) parseLocked(r io.Reader, file string) error {
 }
 
 // Match matches the patterns plus temporary and internal files.
+//
+// The "file" parameter must be in the OS' native unicode format (NFD on macos,
+// NFC everywhere else). This is always the case in real usage in syncthing, as
+// we ensure native unicode normalisation on all entry points (scanning and from
+// protocol) - so no need to normalize when calling this, except e.g. in tests.
 func (m *Matcher) Match(file string) (result ignoreresult.R) {
 	switch {
 	case fs.IsTemporary(file):
@@ -387,6 +394,10 @@ func loadParseIncludeFile(filesystem fs.Filesystem, file string, cd ChangeDetect
 }
 
 func parseLine(line string) ([]Pattern, error) {
+	// We use native normalization internally, thus the patterns must match
+	// that to avoid false negative matches.
+	line = nativeUnicodeNorm(line)
+
 	pattern := Pattern{
 		result: ignoreresult.Ignored,
 	}
@@ -464,6 +475,13 @@ func parseLine(line string) ([]Pattern, error) {
 	return patterns, nil
 }
 
+func nativeUnicodeNorm(s string) string {
+	if build.IsDarwin || build.IsIOS {
+		return norm.NFD.String(s)
+	}
+	return norm.NFC.String(s)
+}
+
 func parseIgnoreFile(fs fs.Filesystem, fd io.Reader, currentFile string, cd ChangeDetector, linesSeen map[string]struct{}) ([]string, []Pattern, error) {
 	var patterns []Pattern
 
diff --git a/lib/ignore/ignore_test.go b/lib/ignore/ignore_test.go
index 5d0a8b02cb..72a0c4db70 100644
--- a/lib/ignore/ignore_test.go
+++ b/lib/ignore/ignore_test.go
@@ -805,7 +805,10 @@ func TestIssue3174(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if !pats.Match("åäö").IsIgnored() {
+	// The pattern above is normalized when parsing, and in order for this
+	// string to match the pattern, it needs to use the same normalization. And
+	// Go always uses NFC regardless of OS, while we use NFD on macos.
+	if !pats.Match(nativeUnicodeNorm("åäö")).IsIgnored() {
 		t.Error("Should match")
 	}
 }
diff --git a/lib/scanner/walk.go b/lib/scanner/walk.go
index 3c34ca2854..7146f9ae06 100644
--- a/lib/scanner/walk.go
+++ b/lib/scanner/walk.go
@@ -293,6 +293,11 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 			return skip
 		}
 
+		// Just in case the filesystem doesn't produce the normalization the OS
+		// uses, and we use internally.
+		nonNormPath := path
+		path = normalizePath(path)
+
 		if m := w.Matcher.Match(path); m.IsIgnored() {
 			l.Debugln(w, "ignored (patterns):", path)
 			// Only descend if matcher says so and the current file is not a symlink.
@@ -319,9 +324,23 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 			return nil
 		}
 
+		if path != nonNormPath {
+			if !w.AutoNormalize {
+				// We're not authorized to do anything about it, so complain and skip.
+				handleError(ctx, "normalizing path", nonNormPath, errUTF8Normalization, finishedChan)
+				return skip
+			}
+
+			path, err = w.applyNormalization(nonNormPath, path, info)
+			if err != nil {
+				handleError(ctx, "normalizing path", nonNormPath, err, finishedChan)
+				return skip
+			}
+		}
+
 		if ignoredParent == "" {
 			// parent isn't ignored, nothing special
-			return w.handleItem(ctx, path, info, toHashChan, finishedChan, skip)
+			return w.handleItem(ctx, path, info, toHashChan, finishedChan)
 		}
 
 		// Part of current path below the ignored (potential) parent
@@ -330,7 +349,7 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 		// ignored path isn't actually a parent of the current path
 		if rel == path {
 			ignoredParent = ""
-			return w.handleItem(ctx, path, info, toHashChan, finishedChan, skip)
+			return w.handleItem(ctx, path, info, toHashChan, finishedChan)
 		}
 
 		// The previously ignored parent directories of the current, not
@@ -345,7 +364,7 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 				handleError(ctx, "scan", ignoredParent, err, finishedChan)
 				return skip
 			}
-			if err = w.handleItem(ctx, ignoredParent, info, toHashChan, finishedChan, skip); err != nil {
+			if err = w.handleItem(ctx, ignoredParent, info, toHashChan, finishedChan); err != nil {
 				return err
 			}
 		}
@@ -355,14 +374,7 @@ func (w *walker) walkAndHashFiles(ctx context.Context, toHashChan chan<- protoco
 	}
 }
 
-func (w *walker) handleItem(ctx context.Context, path string, info fs.FileInfo, toHashChan chan<- protocol.FileInfo, finishedChan chan<- ScanResult, skip error) error {
-	oldPath := path
-	path, err := w.normalizePath(path, info)
-	if err != nil {
-		handleError(ctx, "normalizing path", oldPath, err, finishedChan)
-		return skip
-	}
-
+func (w *walker) handleItem(ctx context.Context, path string, info fs.FileInfo, toHashChan chan<- protocol.FileInfo, finishedChan chan<- ScanResult) error {
 	switch {
 	case info.IsSymlink():
 		if err := w.walkSymlink(ctx, path, info, finishedChan); err != nil {
@@ -375,13 +387,13 @@ func (w *walker) handleItem(ctx context.Context, path string, info fs.FileInfo,
 		return nil
 
 	case info.IsDir():
-		err = w.walkDir(ctx, path, info, finishedChan)
+		return w.walkDir(ctx, path, info, finishedChan)
 
 	case info.IsRegular():
-		err = w.walkRegular(ctx, path, info, toHashChan)
+		return w.walkRegular(ctx, path, info, toHashChan)
 	}
 
-	return err
+	return fmt.Errorf("bug: file info for %v is neither symlink, dir nor regular", path)
 }
 
 func (w *walker) walkRegular(ctx context.Context, relPath string, info fs.FileInfo, toHashChan chan<- protocol.FileInfo) error {
@@ -550,30 +562,21 @@ func (w *walker) walkSymlink(ctx context.Context, relPath string, info fs.FileIn
 	return nil
 }
 
-// normalizePath returns the normalized relative path (possibly after fixing
-// it on disk), or skip is true.
-func (w *walker) normalizePath(path string, info fs.FileInfo) (normPath string, err error) {
+func normalizePath(path string) string {
 	if build.IsDarwin || build.IsIOS {
 		// Mac OS X file names should always be NFD normalized.
-		normPath = norm.NFD.String(path)
-	} else {
-		// Every other OS in the known universe uses NFC or just plain
-		// doesn't bother to define an encoding. In our case *we* do care,
-		// so we enforce NFC regardless.
-		normPath = norm.NFC.String(path)
-	}
-
-	if path == normPath {
-		// The file name is already normalized: nothing to do
-		return path, nil
-	}
-
-	if !w.AutoNormalize {
-		// We're not authorized to do anything about it, so complain and skip.
-
-		return "", errUTF8Normalization
+		return norm.NFD.String(path)
 	}
+	// Every other OS in the known universe uses NFC or just plain
+	// doesn't bother to define an encoding. In our case *we* do care,
+	// so we enforce NFC regardless.
+	return norm.NFC.String(path)
+}
 
+// applyNormalization fixes the normalization of the file on disk, i.e. ensures
+// the file at path ends up named normPath. It shouldn't but may happen that the
+// file ends up with a different name, in which case that one should be scanned.
+func (w *walker) applyNormalization(path, normPath string, info fs.FileInfo) (string, error) {
 	// We will attempt to normalize it.
 	normInfo, err := w.Filesystem.Lstat(normPath)
 	if fs.IsNotExist(err) {
author	Simon Frei <freisim93@gmail.com>	2024-09-28 17:16:44 +0200
committer	GitHub <noreply@github.com>	2024-09-28 17:16:44 +0200
commit	605fd6d726ec20c9e70f3a63e9e31995339f9a01 (patch)
tree	56f4f2316dd30383f3bd9d30d5be5988eb457842
parent	3c476542d26bce7ff7a454dceed9173a4aec40b6 (diff)