1+ using System ;
12using System . Collections . Generic ;
3+ using System . Globalization ;
4+ using System . IO ;
25using System . Text ;
36using System . Threading ;
47using System . Threading . Tasks ;
@@ -44,17 +47,24 @@ public sealed class AudioConverter : IDocumentConverter
4447
4548 private readonly IAudioMetadataExtractor metadataExtractor ;
4649 private readonly IAudioTranscriber transcriber ;
47-
48- public AudioConverter ( string ? exifToolPath = null , Func < byte [ ] , StreamInfo , CancellationToken , Task < string ? > > ? transcribeAsync = null )
49- : this ( new ExifToolAudioMetadataExtractor ( exifToolPath ) ,
50- transcribeAsync is null ? NoOpAudioTranscriber . Instance : new DelegateAudioTranscriber ( transcribeAsync ) )
50+ private readonly SegmentOptions segmentOptions ;
51+
/// <summary>
/// Creates a converter that extracts metadata through <c>ExifToolAudioMetadataExtractor</c> and,
/// when a callback is supplied, transcribes audio through <c>DelegateAudioTranscriber</c>.
/// </summary>
/// <param name="exifToolPath">Forwarded unchanged to the ExifTool-based metadata extractor; may be <c>null</c>.</param>
/// <param name="transcribeAsync">Optional transcription callback; when <c>null</c>, <c>NoOpAudioTranscriber.Instance</c> is used.</param>
/// <param name="segmentOptions">Optional segment options, forwarded to the internal constructor.</param>
public AudioConverter(
    string? exifToolPath = null,
    Func<byte[], StreamInfo, CancellationToken, Task<string?>>? transcribeAsync = null,
    SegmentOptions? segmentOptions = null)
    : this(
        new ExifToolAudioMetadataExtractor(exifToolPath),
        transcribeAsync is { } callback ? new DelegateAudioTranscriber(callback) : NoOpAudioTranscriber.Instance,
        segmentOptions)
{
}
5362
54- internal AudioConverter ( IAudioMetadataExtractor metadataExtractor , IAudioTranscriber transcriber )
/// <summary>
/// Creates a converter from explicit collaborators.
/// </summary>
/// <param name="metadataExtractor">Metadata extractor to use; must not be <c>null</c>.</param>
/// <param name="transcriber">Transcriber to use; must not be <c>null</c>.</param>
/// <param name="segmentOptions">Segment options; <c>SegmentOptions.Default</c> is used when <c>null</c>.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="metadataExtractor"/> or <paramref name="transcriber"/> is <c>null</c>.
/// </exception>
internal AudioConverter(IAudioMetadataExtractor metadataExtractor, IAudioTranscriber transcriber, SegmentOptions? segmentOptions = null)
{
    if (metadataExtractor is null)
    {
        throw new ArgumentNullException(nameof(metadataExtractor));
    }

    if (transcriber is null)
    {
        throw new ArgumentNullException(nameof(transcriber));
    }

    this.metadataExtractor = metadataExtractor;
    this.transcriber = transcriber;
    this.segmentOptions = segmentOptions ?? SegmentOptions.Default;
}
5969
/// <summary>
/// Priority value reported by this converter; always 460.
/// </summary>
public int Priority
{
    get { return 460; }
}
@@ -86,6 +96,69 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
8696 var bytes = memory . ToArray ( ) ;
8797
8898 var metadata = await metadataExtractor . ExtractAsync ( bytes , streamInfo , cancellationToken ) . ConfigureAwait ( false ) ;
99+ var transcript = await TryTranscribeAsync ( bytes , streamInfo , cancellationToken ) . ConfigureAwait ( false ) ;
100+
101+ var segments = BuildSegments ( metadata , transcript , streamInfo ) ;
102+ var markdown = segments . Count > 0
103+ ? SegmentMarkdownComposer . Compose ( segments , segmentOptions )
104+ : "*No audio metadata available.*" ;
105+
106+ var title = metadata . TryGetValue ( "Title" , out var t ) && ! string . IsNullOrWhiteSpace ( t )
107+ ? t . Trim ( )
108+ : streamInfo . FileName is not null ? Path . GetFileNameWithoutExtension ( streamInfo . FileName ) : null ;
109+
110+ return new DocumentConverterResult ( markdown , title , segments ) ;
111+ }
112+
/// <summary>
/// Runs the configured transcriber as a best-effort step.
/// </summary>
/// <param name="audioBytes">Audio payload handed to the transcriber.</param>
/// <param name="streamInfo">Stream description forwarded unchanged to the transcriber.</param>
/// <param name="cancellationToken">Token forwarded to the transcriber.</param>
/// <returns>The transcriber's result, or <c>null</c> when transcription throws.</returns>
/// <exception cref="OperationCanceledException">
/// The transcriber observed a cancellation requested through <paramref name="cancellationToken"/>.
/// </exception>
private async Task<string?> TryTranscribeAsync(byte[] audioBytes, StreamInfo streamInfo, CancellationToken cancellationToken)
{
    try
    {
        return await transcriber.TranscribeAsync(audioBytes, streamInfo, cancellationToken).ConfigureAwait(false);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // A cancellation requested by the caller is not a transcription failure:
        // let it propagate instead of reporting "no transcript".
        throw;
    }
    catch
    {
        // Transcription is optional; any other failure degrades to "no transcript".
        return null;
    }
}
124+
/// <summary>
/// Assembles the ordered segment list for one audio file: an optional metadata segment,
/// followed by an "Audio Transcript" heading segment and the transcript segments when
/// at least one transcript segment exists.
/// </summary>
/// <param name="metadata">Extracted metadata key/value pairs.</param>
/// <param name="transcript">Transcript text, or <c>null</c> when none is available.</param>
/// <param name="streamInfo">Stream description; its <c>FileName</c> is used as the segment source.</param>
/// <returns>The segments in output order; empty when there is neither metadata markdown nor transcript.</returns>
private IReadOnlyList<DocumentSegment> BuildSegments(
    IReadOnlyDictionary<string, string> metadata,
    string? transcript,
    StreamInfo streamInfo)
{
    var fileName = streamInfo.FileName;
    var result = new List<DocumentSegment>();

    var metadataBlock = BuildMetadataMarkdown(metadata);
    if (!string.IsNullOrWhiteSpace(metadataBlock))
    {
        var metadataSegment = new DocumentSegment(
            markdown: metadataBlock,
            type: SegmentType.Metadata,
            label: "Metadata",
            source: fileName);
        result.Add(metadataSegment);
    }

    var audioSegments = CreateAudioSegments(transcript, ParseAudioDuration(metadata), streamInfo);
    if (audioSegments.Count == 0)
    {
        return result;
    }

    // The heading segment always precedes the transcript segments.
    var heading = new DocumentSegment(
        markdown: "### Audio Transcript",
        type: SegmentType.Section,
        label: "Audio Transcript",
        source: fileName);
    result.Add(heading);
    result.AddRange(audioSegments);

    return result;
}
159+
160+ private static string BuildMetadataMarkdown ( IReadOnlyDictionary < string , string > metadata )
161+ {
89162 var builder = new StringBuilder ( ) ;
90163
91164 foreach ( var field in MetadataFields )
@@ -96,39 +169,191 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
96169 }
97170 }
98171
99- var transcript = await TryTranscribeAsync ( bytes , streamInfo , cancellationToken ) . ConfigureAwait ( false ) ;
100- if ( ! string . IsNullOrWhiteSpace ( transcript ) )
172+ return builder . ToString ( ) . TrimEnd ( ) ;
173+ }
174+
/// <summary>
/// Turns a transcript into one or more audio segments with estimated time windows.
/// </summary>
/// <remarks>
/// The transcript is plain text without timing data, so every window is an estimate derived from
/// the total duration, the configured segment duration and the number of text chunks.
/// </remarks>
/// <param name="transcript">Transcript text; <c>null</c> or whitespace yields no segments.</param>
/// <param name="totalDuration">Total audio duration, or <c>null</c> when unknown.</param>
/// <param name="streamInfo">Stream description; its <c>FileName</c> is used as the segment source.</param>
/// <returns>
/// An empty list when there is no transcript; a single segment when the duration is unknown or
/// non-positive, or the configured segment duration is non-positive; otherwise one segment per chunk.
/// </returns>
private IReadOnlyList<DocumentSegment> CreateAudioSegments(string? transcript, TimeSpan? totalDuration, StreamInfo streamInfo)
{
    if (string.IsNullOrWhiteSpace(transcript))
    {
        return Array.Empty<DocumentSegment>();
    }

    var cleanedTranscript = transcript.Trim();
    var duration = totalDuration.GetValueOrDefault();
    var segmentDuration = segmentOptions.Audio.SegmentDuration;

    if (!totalDuration.HasValue || duration <= TimeSpan.Zero || segmentDuration <= TimeSpan.Zero)
    {
        // No usable timing information: emit the whole transcript as a single segment.
        var metadata = totalDuration.HasValue
            ? new Dictionary<string, string> { ["totalDuration"] = FormatDuration(duration) }
            : null;

        return new List<DocumentSegment>
        {
            new DocumentSegment(
                markdown: cleanedTranscript,
                type: SegmentType.Audio,
                number: 1,
                label: "Segment 1",
                startTime: totalDuration.HasValue ? TimeSpan.Zero : null,
                endTime: totalDuration,
                source: streamInfo.FileName,
                additionalMetadata: metadata)
        };
    }

    // Keep the ratio as a double until it has been capped: casting a huge ratio
    // (very small segment duration) straight to int would overflow.
    var nominalCount = Math.Ceiling(duration.TotalSeconds / segmentDuration.TotalSeconds);

    // Allow at most one segment per 500 characters of transcript.
    var lengthBasedCap = Math.Max(1, cleanedTranscript.Length / 500);
    var segmentCount = (int)Math.Max(1d, Math.Min(nominalCount, lengthBasedCap));

    var chunks = SplitTranscriptIntoChunks(cleanedTranscript, segmentCount);
    var segments = new List<DocumentSegment>(chunks.Count);
    var totalDurationText = FormatDuration(duration);

    // Windows of the configured length only line up with the chunks when there is exactly one
    // chunk per window. Otherwise (length cap applied, or the splitter returned a different
    // number of chunks) spread the duration evenly so windows neither skew nor collapse.
    var useConfiguredWindows = chunks.Count == nominalCount;
    var evenWindowTicks = duration.Ticks / chunks.Count;

    for (var i = 0; i < chunks.Count; i++)
    {
        var isLast = i == chunks.Count - 1;
        TimeSpan start;
        TimeSpan end;

        if (useConfiguredWindows)
        {
            start = segmentDuration * i;
            if (start > duration)
            {
                start = duration;
            }

            end = segmentDuration * (i + 1);
            if (end > duration)
            {
                end = duration;
            }
        }
        else
        {
            start = TimeSpan.FromTicks(evenWindowTicks * i);
            end = TimeSpan.FromTicks(evenWindowTicks * (i + 1));
        }

        if (isLast)
        {
            // The final segment always runs to the end of the audio.
            end = duration;
        }

        var metadata = new Dictionary<string, string>
        {
            ["segment"] = (i + 1).ToString(CultureInfo.InvariantCulture),
            ["totalDuration"] = totalDurationText
        };

        segments.Add(new DocumentSegment(
            markdown: chunks[i],
            type: SegmentType.Audio,
            number: i + 1,
            label: $"Segment {i + 1}",
            startTime: start,
            endTime: end,
            source: streamInfo.FileName,
            additionalMetadata: metadata));
    }

    return segments;
}
251+
/// <summary>
/// Splits a transcript into at most <paramref name="segmentCount"/> non-empty, trimmed chunks,
/// preferring to cut just after sentence punctuation, a line break or a space.
/// </summary>
/// <param name="transcript">Text to split.</param>
/// <param name="segmentCount">Maximum number of chunks; values of 1 or less return the text unsplit.</param>
/// <returns>
/// The chunks in original order. Never empty: when every chunk trims to nothing, the list holds
/// the original <paramref name="transcript"/>.
/// </returns>
private static List<string> SplitTranscriptIntoChunks(string transcript, int segmentCount)
{
    if (segmentCount <= 1)
    {
        return new List<string> { transcript };
    }

    var chunks = new List<string>(segmentCount);
    var length = transcript.Length;
    var position = 0;

    var breakChars = new[] { '.', '!', '?', '\n', '\r', ' ' };

    while (position < length)
    {
        var remainingChunks = segmentCount - chunks.Count;
        var end = length;

        // While more than one chunk is still allowed, size this one from what is left so that
        // earlier early cuts are absorbed by later chunks. The last allowed chunk takes the rest,
        // which keeps the result within segmentCount.
        if (remainingChunks > 1)
        {
            var chunkSize = Math.Max(1, (int)Math.Ceiling((double)(length - position) / remainingChunks));
            end = Math.Min(position + chunkSize, length);

            if (end < length)
            {
                // Look back at most 200 characters (and never before position) for a break character.
                var searchLength = Math.Min(chunkSize, 200);
                var splitIndex = transcript.LastIndexOfAny(breakChars, end - 1, searchLength);
                if (splitIndex > position)
                {
                    end = splitIndex + 1;
                }
            }
        }

        var chunk = transcript[position..end].Trim();
        if (!string.IsNullOrEmpty(chunk))
        {
            chunks.Add(chunk);
        }

        position = end;
    }

    if (chunks.Count == 0)
    {
        chunks.Add(transcript);
    }

    return chunks;
}
119295
120- private async Task < string ? > TryTranscribeAsync ( byte [ ] audioBytes , StreamInfo streamInfo , CancellationToken cancellationToken )
/// <summary>
/// Reads the audio duration from the metadata, trying the "Duration" entry first and
/// "MediaDuration" second.
/// </summary>
/// <param name="metadata">Extracted metadata key/value pairs.</param>
/// <returns>The first value that parses as a duration, or <c>null</c> when neither entry does.</returns>
private static TimeSpan? ParseAudioDuration(IReadOnlyDictionary<string, string> metadata)
{
    foreach (var key in new[] { "Duration", "MediaDuration" })
    {
        if (metadata.TryGetValue(key, out var raw) && TryParseDuration(raw, out var parsed))
        {
            return parsed;
        }
    }

    return null;
}
131310
/// <summary>
/// Parses a duration written as plain seconds ("245", "12.5 s"), as a clock value
/// ("3:45", "03:45", "1:02:03"), or in any other form accepted by <see cref="TimeSpan.TryParse(string, IFormatProvider, out TimeSpan)"/>.
/// </summary>
/// <remarks>
/// Values without a colon are always read as seconds, and values with exactly one colon as
/// minutes:seconds. <c>TimeSpan.TryParse</c> would read them as days and hours:minutes respectively.
/// </remarks>
/// <param name="rawValue">Text to parse; a trailing parenthesised annotation such as "(approx)" is ignored.</param>
/// <param name="duration">The parsed duration, or <c>default</c> when parsing fails.</param>
/// <returns><c>true</c> when <paramref name="rawValue"/> was understood; otherwise <c>false</c>. Never throws.</returns>
private static bool TryParseDuration(string rawValue, out TimeSpan duration)
{
    duration = default;
    if (string.IsNullOrWhiteSpace(rawValue))
    {
        return false;
    }

    var value = rawValue.Trim();

    // Drop a trailing annotation, e.g. "0:03:45 (approx)".
    var annotationStart = value.IndexOf('(');
    if (annotationStart > 0)
    {
        value = value[..annotationStart].TrimEnd();
    }

    // Accept a decimal comma ("12,5").
    value = value.Replace(',', '.');

    var colonCount = 0;
    foreach (var ch in value)
    {
        if (ch == ':')
        {
            colonCount++;
        }
    }

    if (colonCount == 0)
    {
        return TryParseSecondsDuration(value, out duration);
    }

    if (colonCount == 1)
    {
        return TryParseClockDuration(value, out duration);
    }

    if (TimeSpan.TryParse(value, CultureInfo.InvariantCulture, out duration))
    {
        return true;
    }

    // TimeSpan.TryParse rejects hours of 24 or more; "h:mm:ss" is still unambiguous.
    return colonCount == 2 && TryParseClockDuration(value, out duration);
}

/// <summary>Parses a colon-free value as a number of seconds, with an optional trailing "s".</summary>
private static bool TryParseSecondsDuration(string value, out TimeSpan duration)
{
    duration = default;

    var sanitized = value;
    if (sanitized.EndsWith("s", StringComparison.OrdinalIgnoreCase))
    {
        sanitized = sanitized[..^1];
    }

    return double.TryParse(sanitized, NumberStyles.Float, CultureInfo.InvariantCulture, out var seconds)
        && TryCreateDurationFromSeconds(seconds, out duration);
}

/// <summary>Parses "m:ss" or "h:mm:ss" where the leading component may have any number of digits.</summary>
private static bool TryParseClockDuration(string value, out TimeSpan duration)
{
    duration = default;

    var parts = value.Split(':');
    if (parts.Length != 2 && parts.Length != 3)
    {
        return false;
    }

    var hasHours = parts.Length == 3;
    var hours = 0;
    if (hasHours && !int.TryParse(parts[0].Trim(), NumberStyles.None, CultureInfo.InvariantCulture, out hours))
    {
        return false;
    }

    var minutesText = parts[parts.Length - 2].Trim();
    var secondsText = parts[parts.Length - 1].Trim();

    if (!int.TryParse(minutesText, NumberStyles.None, CultureInfo.InvariantCulture, out var minutes)
        || !double.TryParse(secondsText, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out var seconds))
    {
        return false;
    }

    // Only the leading component may exceed its usual range.
    if (seconds >= 60 || (hasHours && minutes >= 60))
    {
        return false;
    }

    return TryCreateDurationFromSeconds((hours * 3600d) + (minutes * 60d) + seconds, out duration);
}

/// <summary>Converts seconds to a <see cref="TimeSpan"/>, rejecting values TimeSpan.FromSeconds would throw on.</summary>
private static bool TryCreateDurationFromSeconds(double seconds, out TimeSpan duration)
{
    duration = default;

    if (double.IsNaN(seconds) || double.IsInfinity(seconds))
    {
        return false;
    }

    // Stay clear of the representable limit so rounding inside FromSeconds cannot overflow.
    if (Math.Abs(seconds) >= TimeSpan.MaxValue.TotalSeconds - 1)
    {
        return false;
    }

    duration = TimeSpan.FromSeconds(seconds);
    return true;
}
353+
/// <summary>
/// Formats a duration as "mm:ss" when it is shorter than one hour and as "hh:mm:ss" otherwise.
/// </summary>
/// <param name="duration">Duration to format.</param>
/// <returns>The formatted text; for 24 hours or more the hour field carries the total hours (e.g. "25:00:00").</returns>
private static string FormatDuration(TimeSpan duration)
{
    if (duration.TotalHours >= 24)
    {
        // The "hh" specifier only emits the hours component (0-23) and would silently drop whole days.
        var totalHours = duration.Ticks / TimeSpan.TicksPerHour;
        return string.Format(
            CultureInfo.InvariantCulture,
            "{0:00}:{1:00}:{2:00}",
            totalHours,
            duration.Minutes,
            duration.Seconds);
    }

    return duration.ToString(duration.TotalHours >= 1 ? @"hh\:mm\:ss" : @"mm\:ss", CultureInfo.InvariantCulture);
}
356+
132357 internal interface IAudioMetadataExtractor
133358 {
134359 Task < IReadOnlyDictionary < string , string > > ExtractAsync ( byte [ ] audioBytes , StreamInfo streamInfo , CancellationToken cancellationToken ) ;
0 commit comments