Skip to content

Commit 041f07e

Browse files
authored
Move verify flag into detectableChunk (#4558)
Chunk.Verify is an odd field - it originally conveyed whether a source is going to run with verification, but then, at a certain point in the scanning pipeline, it is mutated such that it instead indicates whether the chunk should be scanned with verification - which is not solely dependent on the source's verify flag. This is unnecessarily difficult to understand and maintain. This commit separates those two pieces of information into two flags: - Chunk.Verify has been renamed to Chunk.SourceVerify - It is no longer mutated; instead, "should this chunk's secrets be verified?" is now captured by a new field on detectableChunk
1 parent e976603 commit 041f07e

File tree

25 files changed

+191
-149
lines changed

25 files changed

+191
-149
lines changed

pkg/decoders/escaped_unicode.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
111111
SecretID: chunk.SecretID,
112112
SourceMetadata: chunk.SourceMetadata,
113113
SourceType: chunk.SourceType,
114-
Verify: chunk.Verify,
114+
SourceVerify: chunk.SourceVerify,
115115
},
116116
}
117117
} else {

pkg/engine/engine.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,7 @@ type detectableChunk struct {
777777
chunk sources.Chunk
778778
decoder detectorspb.DecoderType
779779
wgDoneFn func()
780+
verify bool
780781
}
781782

782783
// verificationOverlapChunk is a decoded chunk that has multiple detectors that match it.
@@ -851,7 +852,7 @@ func (e *Engine) scannerWorker(ctx context.Context) {
851852

852853
for chunk := range e.ChunksChan() {
853854
startTime := time.Now()
854-
sourceVerify := chunk.Verify
855+
sourceVerify := chunk.SourceVerify
855856

856857
decoded := iterativeDecode(chunk, e.decoders, e.maxDecodeDepth)
857858

@@ -869,12 +870,12 @@ func (e *Engine) scannerWorker(ctx context.Context) {
869870
}
870871

871872
for _, detector := range matchingDetectors {
872-
d.Chunk.Verify = e.shouldVerifyChunk(sourceVerify, detector, e.detectorVerificationOverrides)
873873
wgDetect.Add(1)
874874
e.detectableChunksChan <- detectableChunk{
875875
chunk: *d.Chunk,
876876
detector: detector,
877877
decoder: d.DecoderType,
878+
verify: e.shouldVerifyChunk(sourceVerify, detector, e.detectorVerificationOverrides),
878879
wgDoneFn: wgDetect.Done,
879880
}
880881
}
@@ -1069,11 +1070,11 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
10691070

10701071
for _, detector := range detectorKeysWithResults {
10711072
wgDetect.Add(1)
1072-
chunk.chunk.Verify = e.shouldVerifyChunk(chunk.chunk.Verify, detector, e.detectorVerificationOverrides)
10731073
e.detectableChunksChan <- detectableChunk{
10741074
chunk: chunk.chunk,
10751075
detector: detector,
10761076
decoder: chunk.decoder,
1077+
verify: e.shouldVerifyChunk(chunk.chunk.SourceVerify, detector, e.detectorVerificationOverrides),
10771078
wgDoneFn: wgDetect.Done,
10781079
}
10791080
}
@@ -1136,7 +1137,7 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
11361137
results, err := e.verificationCache.FromData(
11371138
ctx,
11381139
data.detector.Detector,
1139-
data.chunk.Verify,
1140+
data.verify,
11401141
data.chunk.SecretID != 0,
11411142
matchBytes)
11421143
t.Stop()

pkg/engine/engine_test.go

Lines changed: 99 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,77 +1501,115 @@ func (p passthroughDecoder) FromChunk(chunk *sources.Chunk) *decoders.DecodableC
15011501

15021502
func (p passthroughDecoder) Type() detectorspb.DecoderType { return detectorspb.DecoderType(-1) }
15031503

1504+
// TestEngine_DetectChunk_UsesVerifyFlag validates that detectChunk correctly forwards detectableChunk.verify to
1505+
// detectors.
15041506
func TestEngine_DetectChunk_UsesVerifyFlag(t *testing.T) {
15051507
ctx := context.Background()
15061508

1507-
// Arrange: Create a minimal engine.
1508-
e := &Engine{
1509-
results: make(chan detectors.ResultWithMetadata, 1),
1510-
verificationCache: verificationcache.New(nil, &verificationcache.InMemoryMetrics{}),
1509+
testCases := []struct {
1510+
name string
1511+
verify bool
1512+
}{
1513+
{name: "verify=true", verify: true},
1514+
{name: "verify=false", verify: false},
15111515
}
15121516

1513-
// Arrange: Create a detector match. We can't create one directly, so we have to use a minimal A-H core.
1514-
ahcore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{passthroughDetector{keywords: []string{"keyword"}}})
1515-
detectorMatches := ahcore.FindDetectorMatches([]byte("keyword"))
1516-
require.Len(t, detectorMatches, 1)
1517+
for _, tc := range testCases {
1518+
t.Run(tc.name, func(t *testing.T) {
1519+
// Arrange: Create a minimal engine.
1520+
e := &Engine{
1521+
results: make(chan detectors.ResultWithMetadata, 1),
1522+
verificationCache: verificationcache.New(nil, &verificationcache.InMemoryMetrics{}),
1523+
}
15171524

1518-
// Arrange: Create a chunk to detect.
1519-
chunk := detectableChunk{
1520-
chunk: sources.Chunk{
1521-
Verify: true,
1522-
},
1523-
detector: detectorMatches[0],
1524-
wgDoneFn: func() {},
1525-
}
1525+
// Arrange: Create a detector match. We can't create one directly, so we have to use a minimal A-H core.
1526+
ahcore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{passthroughDetector{keywords: []string{"keyword"}}})
1527+
detectorMatches := ahcore.FindDetectorMatches([]byte("keyword"))
1528+
require.Len(t, detectorMatches, 1)
15261529

1527-
// Act
1528-
e.detectChunk(ctx, chunk)
1529-
close(e.results)
1530+
// Arrange: Create a chunk to detect.
1531+
chunk := detectableChunk{
1532+
detector: detectorMatches[0],
1533+
verify: tc.verify,
1534+
wgDoneFn: func() {},
1535+
}
15301536

1531-
// Assert: Confirm that a result was generated and that it has the expected verify flag.
1532-
select {
1533-
case result := <-e.results:
1534-
assert.True(t, result.Result.Verified)
1535-
default:
1536-
t.Errorf("expected a result but did not get one")
1537+
// Act
1538+
e.detectChunk(ctx, chunk)
1539+
close(e.results)
1540+
1541+
// Assert: Confirm that a result was generated and that it has the expected verify flag.
1542+
select {
1543+
case result := <-e.results:
1544+
assert.Equal(t, tc.verify, result.Result.Verified)
1545+
default:
1546+
t.Errorf("expected a result but did not get one")
1547+
}
1548+
})
15371549
}
15381550
}
15391551

1552+
// TestEngine_ScannerWorker_DetectableChunkHasCorrectVerifyFlag validates that scannerWorker generates detectableChunk
1553+
// structs that have the correct verify flag set. It also validates that the original chunks' SourceVerify flags are
1554+
// unchanged.
15401555
func TestEngine_ScannerWorker_DetectableChunkHasCorrectVerifyFlag(t *testing.T) {
15411556
ctx := context.Background()
15421557

1543-
// Arrange: Create a minimal engine.
1544-
detector := &passthroughDetector{keywords: []string{"keyword"}}
1545-
e := &Engine{
1546-
AhoCorasickCore: ahocorasick.NewAhoCorasickCore([]detectors.Detector{detector}),
1547-
decoders: []decoders.Decoder{passthroughDecoder{}},
1548-
detectableChunksChan: make(chan detectableChunk, 1),
1549-
sourceManager: sources.NewManager(),
1550-
verify: true,
1551-
maxDecodeDepth: 1,
1558+
testCases := []struct {
1559+
name string
1560+
engineVerify bool
1561+
sourceVerify bool
1562+
wantVerify bool
1563+
}{
1564+
{name: "engineVerify=false,sourceVerify=false", engineVerify: false, sourceVerify: false, wantVerify: false},
1565+
{name: "engineVerify=false,sourceVerify=true", engineVerify: false, sourceVerify: true, wantVerify: false},
1566+
{name: "engineVerify=true,sourceVerify=false", engineVerify: true, sourceVerify: false, wantVerify: false},
1567+
{name: "engineVerify=true,sourceVerify=true", engineVerify: true, sourceVerify: true, wantVerify: true},
15521568
}
15531569

1554-
// Arrange: Create a chunk to scan.
1555-
chunk := sources.Chunk{
1556-
Data: []byte("keyword"),
1557-
Verify: true,
1558-
}
1570+
for _, tc := range testCases {
1571+
t.Run(tc.name, func(t *testing.T) {
1572+
// Arrange: Create a minimal engine.
1573+
detector := &passthroughDetector{keywords: []string{"keyword"}}
1574+
e := &Engine{
1575+
AhoCorasickCore: ahocorasick.NewAhoCorasickCore([]detectors.Detector{detector}),
1576+
decoders: []decoders.Decoder{passthroughDecoder{}},
1577+
detectableChunksChan: make(chan detectableChunk, 1),
1578+
sourceManager: sources.NewManager(),
1579+
verify: tc.engineVerify,
1580+
maxDecodeDepth: 1,
1581+
}
15591582

1560-
// Arrange: Enqueue a chunk to be scanned.
1561-
e.sourceManager.ScanChunk(&chunk)
1583+
// Arrange: Create a chunk to scan.
1584+
chunk := sources.Chunk{
1585+
Data: []byte("keyword"),
1586+
SourceVerify: tc.sourceVerify,
1587+
}
15621588

1563-
// Act
1564-
go e.scannerWorker(ctx)
1589+
// Arrange: Enqueue a chunk to be scanned.
1590+
e.sourceManager.ScanChunk(&chunk)
15651591

1566-
// Assert: Confirm that a chunk was generated and that it has the expected verify flag.
1567-
select {
1568-
case chunk := <-e.detectableChunksChan:
1569-
assert.True(t, chunk.chunk.Verify)
1570-
case <-time.After(1 * time.Second):
1571-
t.Errorf("expected a detectableChunk but did not get one")
1592+
// Act
1593+
go e.scannerWorker(ctx)
1594+
1595+
// Assert: Confirm that a chunk was generated, that its SourceVerify flag is unchanged, and that its verify
1596+
// flag is correctly set.
1597+
select {
1598+
case chunk := <-e.detectableChunksChan:
1599+
assert.Equal(t, tc.sourceVerify, chunk.chunk.SourceVerify)
1600+
assert.Equal(t, tc.wantVerify, chunk.verify)
1601+
case <-time.After(1 * time.Second):
1602+
t.Errorf("expected a detectableChunk but did not get one")
1603+
}
1604+
})
15721605
}
15731606
}
15741607

1608+
// TestEngine_VerificationOverlapWorker_DetectableChunkHasCorrectVerifyFlag validates that the results directly
1609+
// generated by verificationOverlapWorker all came from detector invocations with the verify flag cleared (because these
1610+
// results were generated from verification overlaps). It also validates that detectableChunk structs generated by
1611+
// verificationOverlapWorker have their verify flags correctly set, and that these structs' original chunks'
1612+
// SourceVerify flags are unchanged.
15751613
func TestEngine_VerificationOverlapWorker_DetectableChunkHasCorrectVerifyFlag(t *testing.T) {
15761614
ctx := context.Background()
15771615

@@ -1597,8 +1635,8 @@ func TestEngine_VerificationOverlapWorker_DetectableChunkHasCorrectVerifyFlag(t
15971635

15981636
// Arrange: Create a chunk to "scan."
15991637
chunk := sources.Chunk{
1600-
Data: []byte("keyword ;oahpow8heg;blaisd"),
1601-
Verify: true,
1638+
Data: []byte("keyword ;oahpow8heg;blaisd"),
1639+
SourceVerify: true,
16021640
}
16031641

16041642
// Arrange: Create overlapping detector matches. We can't create them directly, so we have to use a minimal A-H
@@ -1628,11 +1666,13 @@ func TestEngine_VerificationOverlapWorker_DetectableChunkHasCorrectVerifyFlag(t
16281666
assert.False(t, result.Result.Verified)
16291667
}
16301668

1631-
// Assert: Confirm that every generated detectable chunk carries the original Verify flag.
1669+
// Assert: Confirm that every generated detectable chunk's Chunk.SourceVerify flag is unchanged and that its
1670+
// verify flag is correctly set.
16321671
// CMR: There should not be any of these chunks. However, due to what I believe is an unrelated bug, there
1633-
// are. This test ensures that even in that erroneous case, their Verify flag is correct.
1672+
// are. This test ensures that even in that erroneous case, their contents are correct.
16341673
for detectableChunk := range processedDetectableChunks {
1635-
assert.True(t, detectableChunk.chunk.Verify)
1674+
assert.True(t, detectableChunk.verify)
1675+
assert.True(t, detectableChunk.chunk.SourceVerify)
16361676
}
16371677
})
16381678
t.Run("no overlap", func(t *testing.T) {
@@ -1656,8 +1696,8 @@ func TestEngine_VerificationOverlapWorker_DetectableChunkHasCorrectVerifyFlag(t
16561696

16571697
// Arrange: Create a chunk to "scan."
16581698
chunk := sources.Chunk{
1659-
Data: []byte("keyword ;oahpow8heg;blaisd"),
1660-
Verify: true,
1699+
Data: []byte("keyword ;oahpow8heg;blaisd"),
1700+
SourceVerify: true,
16611701
}
16621702

16631703
// Arrange: Create non-overlapping detector matches. We can't create them directly, so we have to use a minimal
@@ -1681,9 +1721,10 @@ func TestEngine_VerificationOverlapWorker_DetectableChunkHasCorrectVerifyFlag(t
16811721
close(e.detectableChunksChan)
16821722
close(processedDetectableChunks)
16831723

1684-
// Assert: Confirm that every generated detectable chunk carries the original Verify flag.
1724+
// Assert: Confirm that SourceVerify flags are unchanged, and verify flags are correctly set.
16851725
for detectableChunk := range processedDetectableChunks {
1686-
assert.True(t, detectableChunk.chunk.Verify)
1726+
assert.True(t, detectableChunk.chunk.SourceVerify)
1727+
assert.True(t, detectableChunk.verify)
16871728
}
16881729
})
16891730
}

pkg/sources/circleci/circleci.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ func (s *Source) chunk(ctx context.Context, proj project, buildNum BuildNum, ste
375375
},
376376
},
377377
},
378-
Verify: s.verify,
378+
SourceVerify: s.verify,
379379
}
380380
if err := data.Error(); err != nil {
381381
return err

pkg/sources/docker/docker.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -351,8 +351,8 @@ func (s *Source) processHistoryEntry(ctx context.Context, historyInfo historyEnt
351351
},
352352
},
353353
},
354-
Verify: s.verify,
355-
Data: []byte(historyInfo.entry.CreatedBy),
354+
SourceVerify: s.verify,
355+
Data: []byte(historyInfo.entry.CreatedBy),
356356
}
357357

358358
ctx.Logger().V(2).Info("scanning image history entry", "index", historyInfo.index, "layer", historyInfo.layerDigest)
@@ -464,7 +464,7 @@ func (s *Source) processChunk(ctx context.Context, info chunkProcessingInfo, chu
464464
},
465465
},
466466
},
467-
Verify: s.verify,
467+
SourceVerify: s.verify,
468468
}
469469
chunk.Data = data.Bytes()
470470

pkg/sources/elasticsearch/elasticsearch.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ func (s *Source) Chunks(
202202
},
203203
},
204204
},
205-
Verify: s.verify,
205+
SourceVerify: s.verify,
206206
}
207207

208208
chunk.Data = []byte(document.message)

pkg/sources/filesystem/filesystem.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ func (s *Source) scanFile(ctx context.Context, path string, chunksChan chan *sou
346346
},
347347
},
348348
},
349-
Verify: s.verify,
349+
SourceVerify: s.verify,
350350
}
351351

352352
return handlers.HandleFile(fileCtx, inputFile, chunkSkel, sources.ChanReporter{Ch: chunksChan})

pkg/sources/gcs/gcs.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -333,11 +333,11 @@ func (s *Source) completeProgress(ctx context.Context) {
333333

334334
func (s *Source) processObject(ctx context.Context, o object) error {
335335
chunkSkel := &sources.Chunk{
336-
SourceName: s.name,
337-
SourceType: s.Type(),
338-
JobID: s.JobID(),
339-
SourceID: s.sourceId,
340-
Verify: s.verify,
336+
SourceName: s.name,
337+
SourceType: s.Type(),
338+
JobID: s.JobID(),
339+
SourceID: s.sourceId,
340+
SourceVerify: s.verify,
341341
SourceMetadata: &source_metadatapb.MetaData{
342342
Data: &source_metadatapb.MetaData_Gcs{
343343
Gcs: &source_metadatapb.GCS{

pkg/sources/gcs/gcs_integration_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,10 @@ func TestChunks_PublicBucket(t *testing.T) {
7979

8080
want := []*sources.Chunk{
8181
{
82-
SourceName: "test",
83-
SourceType: sourcespb.SourceType_SOURCE_TYPE_GCS,
84-
SourceID: 0,
85-
Verify: true,
82+
SourceName: "test",
83+
SourceType: sourcespb.SourceType_SOURCE_TYPE_GCS,
84+
SourceID: 0,
85+
SourceVerify: true,
8686
SourceMetadata: &source_metadatapb.MetaData{
8787
Data: &source_metadatapb.MetaData_Gcs{
8888
Gcs: &source_metadatapb.GCS{
@@ -164,10 +164,10 @@ func createTestChunks() []*sources.Chunk {
164164
chunks := make([]*sources.Chunk, 0, len(objects))
165165
for _, o := range objects {
166166
chunks = append(chunks, &sources.Chunk{
167-
SourceName: "test",
168-
SourceType: sourcespb.SourceType_SOURCE_TYPE_GCS,
169-
SourceID: 0,
170-
Verify: true,
167+
SourceName: "test",
168+
SourceType: sourcespb.SourceType_SOURCE_TYPE_GCS,
169+
SourceID: 0,
170+
SourceVerify: true,
171171
SourceMetadata: &source_metadatapb.MetaData{
172172
Data: &source_metadatapb.MetaData_Gcs{
173173
Gcs: &source_metadatapb.GCS{

pkg/sources/gcs/gcs_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -271,11 +271,11 @@ func createTestObject(id int) object {
271271

272272
func createTestSourceChunk(id int) *sources.Chunk {
273273
return &sources.Chunk{
274-
SourceName: "test",
275-
SourceType: sourcespb.SourceType_SOURCE_TYPE_GCS,
276-
SourceID: 0,
277-
Verify: true,
278-
Data: []byte(fmt.Sprintf("hello world %d", id)),
274+
SourceName: "test",
275+
SourceType: sourcespb.SourceType_SOURCE_TYPE_GCS,
276+
SourceID: 0,
277+
SourceVerify: true,
278+
Data: []byte(fmt.Sprintf("hello world %d", id)),
279279
SourceMetadata: &source_metadatapb.MetaData{
280280
Data: &source_metadatapb.MetaData_Gcs{
281281
Gcs: &source_metadatapb.GCS{

0 commit comments

Comments (0)