forked from managedcode/graphrag
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMarkdownTextChunkerBenchmarks.cs
More file actions
84 lines (71 loc) · 3.6 KB
/
MarkdownTextChunkerBenchmarks.cs
File metadata and controls
84 lines (71 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
using BenchmarkDotNet.Attributes;
using GraphRag.Chunking;
using GraphRag.Config;
namespace ManagedCode.GraphRag.Benchmarks.Chunking;
/// <summary>
/// BenchmarkDotNet benchmarks that run <see cref="MarkdownTextChunker"/> over generated
/// markdown documents of roughly 1,000, 100,000 and 1,000,000 characters, for every
/// combination of the <see cref="ChunkSize"/> and <see cref="ChunkOverlap"/> parameters.
/// </summary>
[MemoryDiagnoser]
public class MarkdownTextChunkerBenchmarks
{
    private MarkdownTextChunker _chunker = null!;
    private ChunkSlice[] _smallDocument = null!;
    private ChunkSlice[] _mediumDocument = null!;
    private ChunkSlice[] _largeDocument = null!;
    private ChunkingConfig _config = null!;

    /// <summary>
    /// Value copied into <c>ChunkingConfig.Size</c> during setup.
    /// NOTE(review): the unit (tokens vs. characters) is defined by the chunker, not here.
    /// </summary>
    [Params(512, 1024, 2048)]
    public int ChunkSize { get; set; }

    /// <summary>
    /// Value copied into <c>ChunkingConfig.Overlap</c> during setup.
    /// </summary>
    [Params(0, 64, 128)]
    public int ChunkOverlap { get; set; }

    /// <summary>
    /// Creates the chunker, builds the configuration from the current parameter values
    /// and generates the three input documents, so none of that work happens inside
    /// the benchmark methods themselves.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        _chunker = new MarkdownTextChunker();

        var config = new ChunkingConfig
        {
            Size = ChunkSize,
            Overlap = ChunkOverlap,
            Strategy = ChunkStrategyType.Sentence
        };
        _config = config;

        // One single-slice input per size class: ~1 KB, ~100 KB and ~1 MB of characters.
        _smallDocument = SingleDocument(1_000);
        _mediumDocument = SingleDocument(100_000);
        _largeDocument = SingleDocument(1_000_000);
    }

    /// <summary>Chunks the document generated with a target length of 1,000 characters.</summary>
    [Benchmark]
    public IReadOnlyList<TextChunk> ChunkSmallDocument() => _chunker.Chunk(_smallDocument, _config);

    /// <summary>Chunks the document generated with a target length of 100,000 characters.</summary>
    [Benchmark]
    public IReadOnlyList<TextChunk> ChunkMediumDocument() => _chunker.Chunk(_mediumDocument, _config);

    /// <summary>Chunks the document generated with a target length of 1,000,000 characters.</summary>
    [Benchmark]
    public IReadOnlyList<TextChunk> ChunkLargeDocument() => _chunker.Chunk(_largeDocument, _config);

    /// <summary>
    /// Wraps a generated markdown document in a one-element slice array with the id "doc1".
    /// </summary>
    /// <param name="approximateLength">Target length passed to <see cref="GenerateMarkdownDocument"/>.</param>
    private static ChunkSlice[] SingleDocument(int approximateLength)
    {
        var text = GenerateMarkdownDocument(approximateLength);
        return new[] { new ChunkSlice("doc1", text) };
    }

    /// <summary>
    /// Builds a synthetic markdown string by appending a fixed cycle of fragments
    /// (headings, paragraphs, lists, a fenced code block, a blockquote and a table)
    /// until the text is at least <paramref name="approximateLength"/> characters long.
    /// </summary>
    /// <param name="approximateLength">
    /// Minimum length of the result; because whole fragments are appended, the actual
    /// length can exceed this by less than one fragment.
    /// </param>
    /// <returns>The concatenated markdown text.</returns>
    private static string GenerateMarkdownDocument(int approximateLength)
    {
        string[] fragments =
        {
            "# Introduction\n\nThis is a sample markdown document for benchmarking purposes. It contains various markdown elements including headers, paragraphs, lists, and code blocks.\n\n",
            "## Section One\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.\n\n",
            "### Subsection A\n\nDuis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident.\n\n",
            "- First item in the list\n- Second item with more content\n- Third item explaining something important\n\n",
            "1. Numbered first item\n2. Numbered second item\n3. Numbered third item with explanation\n\n",
            "```csharp\npublic class Example\n{\n public void Method() { }\n}\n```\n\n",
            "## Section Two\n\nSunt in culpa qui officia deserunt mollit anim id est laborum. Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium.\n\n",
            "> This is a blockquote that spans multiple lines and contains important information that should be preserved during chunking.\n\n",
            "### Subsection B\n\nNemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt.\n\n",
            "| Column 1 | Column 2 | Column 3 |\n|----------|----------|----------|\n| Data 1 | Data 2 | Data 3 |\n| Data 4 | Data 5 | Data 6 |\n\n"
        };

        // Extra capacity leaves room for the final fragment that overshoots the target.
        var builder = new System.Text.StringBuilder(approximateLength + 1000);

        for (var position = 0; builder.Length < approximateLength; position++)
        {
            var fragment = fragments[position % fragments.Length];
            builder.Append(fragment);
        }

        return builder.ToString();
    }
}