summaryrefslogtreecommitdiffstats
path: root/filetree/efficiency.go
blob: e383b9dd14c216a122002bcffe171aa3f037aa8b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
package filetree

import (
	"fmt"
	"github.com/sirupsen/logrus"
	"sort"
)

// EfficiencyData represents the storage and reference statistics for a given file tree path.
type EfficiencyData struct {
	// Path is the file tree path these statistics describe.
	Path              string
	// Nodes collects every FileNode observed at Path, one entry per layer that touches it.
	Nodes             []*FileNode
	// CumulativeSize is the total number of bytes attributed to Path summed across all layers.
	CumulativeSize    int64
	// minDiscoveredSize is the smallest size (in bytes) seen for Path in any single layer;
	// -1 until the first discovery (see Efficiency).
	minDiscoveredSize int64
}

// EfficiencySlice represents an ordered set of EfficiencyData data structures.
// It implements sort.Interface (see Len, Swap, and Less below), ordered by CumulativeSize.
type EfficiencySlice []*EfficiencyData

// Len reports the number of entries in the slice (part of sort.Interface).
func (s EfficiencySlice) Len() int {
	return len(s)
}

// Swap exchanges the entries at positions i and j (part of sort.Interface).
func (s EfficiencySlice) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}

// Less orders entries by ascending cumulative size (part of sort.Interface).
func (s EfficiencySlice) Less(i, j int) bool {
	return s[j].CumulativeSize > s[i].CumulativeSize
}

// Efficiency returns the score and file set of the given set of FileTrees (layers). This is loosely based on:
// 1. Files that are duplicated across layers discounts your score, weighted by file size
// 2. Files that are removed discounts your score, weighted by the original file size
func Efficiency(trees []*FileTree) (float64, EfficiencySlice) {
	efficiencyMap := make(map[string]*EfficiencyData)
	inefficientMatches := make(EfficiencySlice, 0)
	currentTree := 0

	visitor := func(node *FileNode) error {
		path := node.Path()
		if _, ok := efficiencyMap[path]; !ok {
			efficiencyMap[path] = &EfficiencyData{
				Path:              path,
				Nodes:             make([]*FileNode, 0),
				minDiscoveredSize: -1,
			}
		}
		data := efficiencyMap[path]

		// this node may have had children that were deleted, however, we won't explicitly list out every child, only
		// the top-most parent with the cumulative size. These operations will need to be done on the full (stacked)
		// tree.
		// Note: whiteout files may also represent directories, so we need to find out if this was previously a file or dir.
		var sizeBytes int64

		if node.IsWhiteout() {
			sizer := func(curNode *FileNode) error {
				sizeBytes += curNode.Data.FileInfo.TarHeader.FileInfo().Size()
				return nil
			}
			stackedTree := StackRange(trees, 0, currentTree-1)
			previousTreeNode, err := stackedTree.GetNode(node.Path())
			if err != nil {
				logrus.Debug(fmt.Sprintf("CurrentTree: %d : %s", currentTree, err))
			} else if previousTreeNode.Data.FileInfo.TarHeader.FileInfo().IsDir() {
				previousTreeNode.VisitDepthChildFirst(sizer, nil)
			}

		} else {
			sizeBytes = node.Data.FileInfo.TarHeader.FileInfo().Size()
		}

		data.CumulativeSize += sizeBytes
		if data.minDiscoveredSize < 0 || sizeBytes < data.minDiscoveredSize {
			data.minDiscoveredSize = sizeBytes
		}
		data.Nodes = append(data.Nodes, node)

		if len(data.Nodes) == 2 {
			inefficientMatches = append(inefficientMatches, data)
		}

		return nil
	}
	visitEvaluator := func(node *FileNode) bool {
		return node.IsLeaf()
	}
	for idx, tree := range trees {
		currentTree = idx
		tree.VisitDepthChildFirst(visitor, visitEvaluator)
	}

	// calculate the score
	var minimumPathSizes int64
	var discoveredPathSizes int64

	for _, value := range efficiencyMap {
		minimumPathSizes += value.minDiscoveredSize
		discoveredPathSizes += value.CumulativeSize
	}
	score := float64(minimumPathSizes) / float64(discoveredPathSizes)

	sort.Sort(inefficientMatches)

	return score, inefficientMatches
}