Skip to content

Commit a305ecc

Browse files
committed
options and metadata
1 parent e3dd6d6 commit a305ecc

20 files changed

Lines changed: 1347 additions & 285 deletions

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ If I tell you to remember something, you do the same, update
1111
## Rules to follow
1212
- MIME handling: always use `ManagedCode.MimeTypes` for MIME constants, lookups, and validation logic.
1313
- Treat this repository as a high-fidelity port of `microsoft-markitdown`: every test fixture copied from the upstream `tests/test_files/` directory must be referenced by .NET tests (either as positive conversions or explicit unsupported cases). No orphaned fixtures.
14+
- CSV parsing must use the `Sep` library; avoid Sylvan or other CSV parsers for new or updated code.
1415

1516
# Repository Guidelines
1617

README.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ Console.WriteLine(urlResult.Title);
377377
### Customise the pipeline with options
378378

379379
```csharp
380+
using System;
380381
using Azure;
381382
using MarkItDown;
382383

@@ -387,6 +388,14 @@ var options = new MarkItDownOptions
387388
await myCaptionService.DescribeAsync(bytes, info, token),
388389
AudioTranscriber = async (bytes, info, token) =>
389390
await speechClient.TranscribeAsync(bytes, info, token),
391+
Segments = new SegmentOptions
392+
{
393+
IncludeSegmentMetadataInMarkdown = true,
394+
Audio = new AudioSegmentOptions
395+
{
396+
SegmentDuration = TimeSpan.FromMinutes(2)
397+
}
398+
},
390399
DocumentIntelligence = new DocumentIntelligenceOptions
391400
{
392401
Endpoint = "https://<your-resource>.cognitiveservices.azure.com/",
@@ -395,6 +404,8 @@ var options = new MarkItDownOptions
395404
};
396405

397406
var markItDown = new MarkItDown(options);
407+
408+
// result.Segments is always populated; IncludeSegmentMetadataInMarkdown only controls whether inline annotations are written into the Markdown.
398409
```
399410

400411
### Custom converters
@@ -682,7 +693,7 @@ public class DocumentConversionFunction
682693

683694
- **`MarkItDown`** - Main entry point for conversions
684695
- **`IDocumentConverter`** - Interface for format-specific converters
685-
- **`DocumentConverterResult`** - Contains the converted Markdown and optional metadata
696+
- **`DocumentConverterResult`** - Contains the aggregate Markdown plus structured `DocumentSegment` entries
686697
- **`StreamInfo`** - Metadata about the input stream (MIME type, extension, charset, etc.)
687698
- **`ConverterRegistration`** - Associates converters with priority for selection
688699

@@ -710,6 +721,26 @@ MarkItDown includes these converters in priority order:
710721
- **`ImageConverter`** - Image metadata via ExifTool and optional captions
711722
- **`PlainTextConverter`** - Plain text, Markdown, and other text formats (fallback)
712723

724+
### Structured Segments & Metadata
725+
726+
Every conversion populates `DocumentConverterResult.Segments` with strongly typed `DocumentSegment` instances. Segments preserve natural breakpoints (pages, slides, sheets, archive entries, audio ranges) alongside rich metadata:
727+
728+
- `Type` and `Number` expose what the segment represents (for example page/slide numbers)
729+
- `Label` carries human-readable descriptors when available
730+
- `StartTime`/`EndTime` capture media timelines for audio/video content
731+
- `AdditionalMetadata` holds contextual properties such as archive entry paths or sheet names
732+
733+
```csharp
734+
var result = await markItDown.ConvertAsync("report.pdf");
735+
736+
foreach (var segment in result.Segments)
737+
{
738+
Console.WriteLine($"[{segment.Type}] #{segment.Number}: {segment.Label}");
739+
}
740+
```
741+
742+
Runtime behaviour is controlled through `SegmentOptions` on `MarkItDownOptions`. Enabling `IncludeSegmentMetadataInMarkdown` emits inline annotations such as `[page:1]`, `[sheet:Sales]`, or `[timecode:00:01:00-00:02:00]` directly in the Markdown output. Audio transcripts are split according to `Segments.Audio.SegmentDuration`; a transcript that is too short to split is kept as a single segment that still carries its start and end times.
743+
713744
### Converter Priority & Detection
714745

715746
- Priority-based dispatch (lower values processed first)

src/MarkItDown/Converters/AudioConverter.cs

Lines changed: 247 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
using System;
12
using System.Collections.Generic;
3+
using System.Globalization;
4+
using System.IO;
25
using System.Text;
36
using System.Threading;
47
using System.Threading.Tasks;
@@ -44,17 +47,24 @@ public sealed class AudioConverter : IDocumentConverter
4447

4548
private readonly IAudioMetadataExtractor metadataExtractor;
4649
private readonly IAudioTranscriber transcriber;
47-
48-
public AudioConverter(string? exifToolPath = null, Func<byte[], StreamInfo, CancellationToken, Task<string?>>? transcribeAsync = null)
49-
: this(new ExifToolAudioMetadataExtractor(exifToolPath),
50-
transcribeAsync is null ? NoOpAudioTranscriber.Instance : new DelegateAudioTranscriber(transcribeAsync))
50+
private readonly SegmentOptions segmentOptions;
51+
52+
public AudioConverter(
53+
string? exifToolPath = null,
54+
Func<byte[], StreamInfo, CancellationToken, Task<string?>>? transcribeAsync = null,
55+
SegmentOptions? segmentOptions = null)
56+
: this(
57+
new ExifToolAudioMetadataExtractor(exifToolPath),
58+
transcribeAsync is null ? NoOpAudioTranscriber.Instance : new DelegateAudioTranscriber(transcribeAsync),
59+
segmentOptions)
5160
{
5261
}
5362

54-
internal AudioConverter(IAudioMetadataExtractor metadataExtractor, IAudioTranscriber transcriber)
63+
internal AudioConverter(IAudioMetadataExtractor metadataExtractor, IAudioTranscriber transcriber, SegmentOptions? segmentOptions = null)
5564
{
5665
this.metadataExtractor = metadataExtractor ?? throw new ArgumentNullException(nameof(metadataExtractor));
5766
this.transcriber = transcriber ?? throw new ArgumentNullException(nameof(transcriber));
67+
this.segmentOptions = segmentOptions ?? SegmentOptions.Default;
5868
}
5969

6070
public int Priority => 460;
@@ -86,6 +96,69 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
8696
var bytes = memory.ToArray();
8797

8898
var metadata = await metadataExtractor.ExtractAsync(bytes, streamInfo, cancellationToken).ConfigureAwait(false);
99+
var transcript = await TryTranscribeAsync(bytes, streamInfo, cancellationToken).ConfigureAwait(false);
100+
101+
var segments = BuildSegments(metadata, transcript, streamInfo);
102+
var markdown = segments.Count > 0
103+
? SegmentMarkdownComposer.Compose(segments, segmentOptions)
104+
: "*No audio metadata available.*";
105+
106+
var title = metadata.TryGetValue("Title", out var t) && !string.IsNullOrWhiteSpace(t)
107+
? t.Trim()
108+
: streamInfo.FileName is not null ? Path.GetFileNameWithoutExtension(streamInfo.FileName) : null;
109+
110+
return new DocumentConverterResult(markdown, title, segments);
111+
}
112+
113+
/// <summary>
/// Runs the configured transcriber as a best-effort step.
/// </summary>
/// <param name="audioBytes">Raw audio content handed to the transcriber.</param>
/// <param name="streamInfo">Stream description forwarded to the transcriber.</param>
/// <param name="cancellationToken">Token forwarded to the transcriber.</param>
/// <returns>The transcriber's result, or <c>null</c> when the transcriber throws.</returns>
/// <exception cref="OperationCanceledException">
/// Rethrown when <paramref name="cancellationToken"/> has been cancelled, so that a
/// caller-requested cancellation is not mistaken for "no transcript available".
/// </exception>
private async Task<string?> TryTranscribeAsync(byte[] audioBytes, StreamInfo streamInfo, CancellationToken cancellationToken)
{
    try
    {
        return await transcriber.TranscribeAsync(audioBytes, streamInfo, cancellationToken).ConfigureAwait(false);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Cancellation requested by the caller must propagate; it is not a transcription failure.
        throw;
    }
    catch (Exception)
    {
        // Transcription is optional: any other failure degrades to "no transcript".
        return null;
    }
}
124+
125+
/// <summary>
/// Assembles the ordered segment list for an audio file: an optional metadata segment,
/// followed by an "Audio Transcript" section heading and the transcript segments when a
/// transcript is available.
/// </summary>
/// <param name="metadata">Extracted metadata key/value pairs.</param>
/// <param name="transcript">Transcript text, or <c>null</c> when none was produced.</param>
/// <param name="streamInfo">Stream description; its file name is recorded as each segment's source.</param>
/// <returns>The segments in output order; empty when there is neither metadata nor transcript.</returns>
private IReadOnlyList<DocumentSegment> BuildSegments(
    IReadOnlyDictionary<string, string> metadata,
    string? transcript,
    StreamInfo streamInfo)
{
    var fileName = streamInfo.FileName;
    var result = new List<DocumentSegment>();

    // Metadata block comes first, but only when it rendered to something visible.
    var metadataBlock = BuildMetadataMarkdown(metadata);
    var hasMetadata = !string.IsNullOrWhiteSpace(metadataBlock);
    if (hasMetadata)
    {
        var metadataSegment = new DocumentSegment(
            markdown: metadataBlock,
            type: SegmentType.Metadata,
            label: "Metadata",
            source: fileName);
        result.Add(metadataSegment);
    }

    var transcriptParts = CreateAudioSegments(transcript, ParseAudioDuration(metadata), streamInfo);
    if (transcriptParts.Count == 0)
    {
        return result;
    }

    // The heading is its own segment so the transcript slices stay free of markup.
    var heading = new DocumentSegment(
        markdown: "### Audio Transcript",
        type: SegmentType.Section,
        label: "Audio Transcript",
        source: fileName);
    result.Add(heading);
    result.AddRange(transcriptParts);

    return result;
}
159+
160+
private static string BuildMetadataMarkdown(IReadOnlyDictionary<string, string> metadata)
161+
{
89162
var builder = new StringBuilder();
90163

91164
foreach (var field in MetadataFields)
@@ -96,39 +169,191 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
96169
}
97170
}
98171

99-
var transcript = await TryTranscribeAsync(bytes, streamInfo, cancellationToken).ConfigureAwait(false);
100-
if (!string.IsNullOrWhiteSpace(transcript))
172+
return builder.ToString().TrimEnd();
173+
}
174+
175+
/// <summary>
/// Turns a transcript into one or more audio segments.
/// </summary>
/// <remarks>
/// When the total duration is unknown or non-positive, or the configured segment duration is
/// non-positive, the whole transcript becomes a single segment. Otherwise the segment count is
/// derived from the duration and capped at one segment per 500 transcript characters, and each
/// resulting chunk receives a time range clamped to the total duration.
/// </remarks>
/// <param name="transcript">Transcript text; null or whitespace yields no segments.</param>
/// <param name="totalDuration">Total audio duration, when known.</param>
/// <param name="streamInfo">Stream description; its file name is recorded as each segment's source.</param>
/// <returns>The transcript segments in order; empty when there is no transcript.</returns>
private IReadOnlyList<DocumentSegment> CreateAudioSegments(string? transcript, TimeSpan? totalDuration, StreamInfo streamInfo)
{
    if (string.IsNullOrWhiteSpace(transcript))
    {
        return Array.Empty<DocumentSegment>();
    }

    var text = transcript.Trim();
    var total = totalDuration ?? TimeSpan.Zero;
    var slice = segmentOptions.Audio.SegmentDuration;

    var canSlice = totalDuration.HasValue && total > TimeSpan.Zero && slice > TimeSpan.Zero;
    if (!canSlice)
    {
        // No usable timeline: emit the transcript as one segment, timed only if a duration exists.
        Dictionary<string, string>? extras = null;
        if (totalDuration.HasValue)
        {
            extras = new Dictionary<string, string> { ["totalDuration"] = FormatDuration(total) };
        }

        var whole = new DocumentSegment(
            markdown: text,
            type: SegmentType.Audio,
            number: 1,
            label: "Segment 1",
            startTime: totalDuration.HasValue ? TimeSpan.Zero : null,
            endTime: totalDuration,
            source: streamInfo.FileName,
            additionalMetadata: extras);

        return new List<DocumentSegment> { whole };
    }

    // Time-based count, capped so that short transcripts are not spread over many slices.
    var countByTime = Math.Max(1, (int)Math.Ceiling(total.TotalSeconds / slice.TotalSeconds));
    var countByLength = Math.Max(1, text.Length / 500);
    var pieces = SplitTranscriptIntoChunks(text, Math.Min(countByTime, countByLength));

    var totalText = FormatDuration(total);
    var result = new List<DocumentSegment>(pieces.Count);
    var index = 0;

    foreach (var piece in pieces)
    {
        var ordinal = index + 1;

        var start = slice * index;
        if (start > total)
        {
            start = total;
        }

        // The final chunk always runs to the end of the audio; earlier ones are clamped to it.
        var boundary = slice * ordinal;
        var end = (ordinal == pieces.Count || boundary > total) ? total : boundary;

        var extras = new Dictionary<string, string>
        {
            ["segment"] = ordinal.ToString(CultureInfo.InvariantCulture),
            ["totalDuration"] = totalText
        };

        result.Add(new DocumentSegment(
            markdown: piece,
            type: SegmentType.Audio,
            number: ordinal,
            label: $"Segment {ordinal}",
            startTime: start,
            endTime: end,
            source: streamInfo.FileName,
            additionalMetadata: extras));

        index++;
    }

    return result;
}
251+
252+
private static List<string> SplitTranscriptIntoChunks(string transcript, int segmentCount)
253+
{
254+
if (segmentCount <= 1)
101255
{
102-
if (builder.Length > 0)
256+
return new List<string> { transcript };
257+
}
258+
259+
var segments = new List<string>(segmentCount);
260+
var length = transcript.Length;
261+
var chunkSize = Math.Max(1, (int)Math.Ceiling((double)length / segmentCount));
262+
var position = 0;
263+
264+
var breakChars = new[] { '.', '!', '?', '\n', '\r', ' ' };
265+
266+
while (position < length)
267+
{
268+
var end = Math.Min(position + chunkSize, length);
269+
if (end < length)
103270
{
104-
builder.AppendLine();
271+
var searchLength = Math.Min(chunkSize, 200);
272+
var splitIndex = transcript.LastIndexOfAny(breakChars, end - 1, searchLength);
273+
if (splitIndex > position)
274+
{
275+
end = splitIndex + 1;
276+
}
105277
}
106278

107-
builder.AppendLine("### Audio Transcript");
108-
builder.AppendLine();
109-
builder.AppendLine(transcript.Trim());
279+
var chunk = transcript[position..end].Trim();
280+
if (!string.IsNullOrEmpty(chunk))
281+
{
282+
segments.Add(chunk);
283+
}
284+
285+
position = end;
110286
}
111287

112-
var markdown = builder.Length > 0 ? builder.ToString().TrimEnd() : "*No audio metadata available.*";
113-
var title = metadata.TryGetValue("Title", out var t) && !string.IsNullOrWhiteSpace(t)
114-
? t.Trim()
115-
: streamInfo.FileName is not null ? Path.GetFileNameWithoutExtension(streamInfo.FileName) : null;
288+
if (segments.Count == 0)
289+
{
290+
segments.Add(transcript);
291+
}
116292

117-
return new DocumentConverterResult(markdown, title);
293+
return segments;
118294
}
119295

120-
private async Task<string?> TryTranscribeAsync(byte[] audioBytes, StreamInfo streamInfo, CancellationToken cancellationToken)
296+
private static TimeSpan? ParseAudioDuration(IReadOnlyDictionary<string, string> metadata)
121297
{
122-
try
298+
if (metadata.TryGetValue("Duration", out var value) && TryParseDuration(value, out var duration))
123299
{
124-
return await transcriber.TranscribeAsync(audioBytes, streamInfo, cancellationToken).ConfigureAwait(false);
300+
return duration;
125301
}
126-
catch
302+
303+
if (metadata.TryGetValue("MediaDuration", out value) && TryParseDuration(value, out duration))
127304
{
128-
return null;
305+
return duration;
129306
}
307+
308+
return null;
130309
}
131310

311+
/// <summary>
/// Attempts to interpret a metadata duration value.
/// </summary>
/// <remarks>
/// Accepted shapes, tried in order: <c>m:ss</c> / <c>mm:ss</c>, <c>hh:mm:ss</c>, any other
/// colon-separated value understood by <see cref="TimeSpan.TryParse(string, IFormatProvider, out TimeSpan)"/>,
/// and finally a plain number of seconds with an optional trailing "s" and either '.' or ','
/// as the decimal separator.
/// </remarks>
/// <param name="rawValue">The raw metadata value.</param>
/// <param name="duration">The parsed duration, or <c>default</c> on failure.</param>
/// <returns><c>true</c> when the value was parsed; otherwise <c>false</c>. Never throws.</returns>
private static bool TryParseDuration(string rawValue, out TimeSpan duration)
{
    duration = default;

    if (string.IsNullOrWhiteSpace(rawValue))
    {
        return false;
    }

    var value = rawValue.Trim();
    var colonCount = 0;
    foreach (var ch in value)
    {
        if (ch == ':')
        {
            colonCount++;
        }
    }

    // "mm" demands exactly two digits, so "1:23" needs the single-"m" form as well; without it
    // the value would fall through to the general parser and be read as 1 hour 23 minutes.
    if (colonCount == 1
        && TimeSpan.TryParseExact(value, new[] { @"m\:ss", @"mm\:ss" }, CultureInfo.InvariantCulture, out duration))
    {
        return true;
    }

    if (colonCount == 2 && TimeSpan.TryParseExact(value, @"hh\:mm\:ss", CultureInfo.InvariantCulture, out duration))
    {
        return true;
    }

    // The general parser reads a colon-free number as a count of DAYS ("30" -> 30 days), so it
    // is only consulted for colon-separated values; bare numbers are handled as seconds below.
    if (colonCount > 0 && TimeSpan.TryParse(value, CultureInfo.InvariantCulture, out duration))
    {
        return true;
    }

    var sanitized = value.Replace(',', '.');
    if (sanitized.EndsWith("s", StringComparison.OrdinalIgnoreCase))
    {
        sanitized = sanitized[..^1];
    }

    // NaN and infinities parse successfully as doubles but cannot be converted to a TimeSpan.
    if (double.TryParse(sanitized, NumberStyles.Float, CultureInfo.InvariantCulture, out var seconds)
        && !double.IsNaN(seconds)
        && !double.IsInfinity(seconds))
    {
        try
        {
            duration = TimeSpan.FromSeconds(seconds);
            return true;
        }
        catch (OverflowException)
        {
            // Finite but outside the TimeSpan range: report failure instead of throwing.
        }
    }

    duration = default;
    return false;
}
353+
354+
/// <summary>
/// Formats a duration as <c>mm:ss</c> when it is shorter than one hour, otherwise as
/// <c>hh:mm:ss</c> where the hour field is the total number of whole hours.
/// </summary>
/// <param name="duration">The duration to format.</param>
/// <returns>The culture-invariant text form of <paramref name="duration"/>.</returns>
private static string FormatDuration(TimeSpan duration)
{
    if (duration.TotalHours < 1)
    {
        return duration.ToString(@"mm\:ss", CultureInfo.InvariantCulture);
    }

    // The "hh" specifier only covers 0-23 and silently drops whole days, so a 25-hour
    // duration would print as "01:00:00". Use the total hour count instead.
    var wholeHours = (long)duration.TotalHours;
    return string.Format(
        CultureInfo.InvariantCulture,
        "{0:00}:{1:00}:{2:00}",
        wholeHours,
        duration.Minutes,
        duration.Seconds);
}
356+
132357
internal interface IAudioMetadataExtractor
133358
{
134359
Task<IReadOnlyDictionary<string, string>> ExtractAsync(byte[] audioBytes, StreamInfo streamInfo, CancellationToken cancellationToken);

0 commit comments

Comments
 (0)