mirror of https://github.com/HandBrake/HandBrake
634 lines
18 KiB
C
634 lines
18 KiB
C
/* decsrtsub.c
|
|
|
|
Copyright (c) 2003-2025 HandBrake Team
|
|
This file is part of the HandBrake source code
|
|
Homepage: <http://handbrake.fr/>.
|
|
It may be used under the terms of the GNU General Public License v2.
|
|
For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
|
|
*/
|
|
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <iconv.h>
|
|
#include <errno.h>
|
|
#include "libavutil/avutil.h"
|
|
#include "handbrake/handbrake.h"
|
|
#include "handbrake/colormap.h"
|
|
#include "handbrake/decavsub.h"
|
|
#include "handbrake/extradata.h"
|
|
|
|
struct start_and_end {
|
|
unsigned long start, end;
|
|
};
|
|
|
|
enum
|
|
{
|
|
k_state_inEntry,
|
|
k_state_inEntry_or_new,
|
|
k_state_potential_new_entry,
|
|
k_state_timecode,
|
|
};
|
|
|
|
typedef struct srt_entry_s {
|
|
long offset, duration;
|
|
long start, stop;
|
|
char text[1024];
|
|
int pos;
|
|
} srt_entry_t;
|
|
|
|
/*
|
|
* Store all context in the work private struct,
|
|
*/
|
|
struct hb_work_private_s
|
|
{
|
|
hb_decavsub_context_t * ctx;
|
|
hb_job_t * job;
|
|
FILE * file;
|
|
char buf[1024];
|
|
int pos;
|
|
int end;
|
|
char utf8_buf[2048];
|
|
int utf8_pos;
|
|
int utf8_end;
|
|
int utf8_bom_skipped;
|
|
unsigned long current_time;
|
|
unsigned long number_of_entries;
|
|
unsigned long last_entry_number;
|
|
unsigned long current_state;
|
|
srt_entry_t current_entry;
|
|
iconv_t iconv_context;
|
|
hb_subtitle_t *subtitle;
|
|
uint64_t start_time; // In HB time
|
|
uint64_t stop_time; // In HB time
|
|
|
|
int line; // SSA line number
|
|
};
|
|
|
|
static int
|
|
read_time_from_string( const char* timeString, struct start_and_end *result )
|
|
{
|
|
// for ex. 00:00:15,248 --> 00:00:16,545
|
|
|
|
long hours1, minutes1, seconds1, milliseconds1,
|
|
hours2, minutes2, seconds2, milliseconds2;
|
|
int scanned;
|
|
|
|
scanned = sscanf(timeString, "%ld:%ld:%ld,%ld --> %ld:%ld:%ld,%ld\n",
|
|
&hours1, &minutes1, &seconds1, &milliseconds1,
|
|
&hours2, &minutes2, &seconds2, &milliseconds2);
|
|
if (scanned != 8)
|
|
{
|
|
scanned = sscanf(timeString, "%ld:%ld:%ld.%ld --> %ld:%ld:%ld.%ld\n",
|
|
&hours1, &minutes1, &seconds1, &milliseconds1,
|
|
&hours2, &minutes2, &seconds2, &milliseconds2);
|
|
if (scanned != 8)
|
|
{
|
|
return 0;
|
|
}
|
|
}
|
|
result->start =
|
|
milliseconds1 + seconds1*1000 + minutes1*60*1000 + hours1*60*60*1000;
|
|
result->end =
|
|
milliseconds2 + seconds2*1000 + minutes2*60*1000 + hours2*60*60*1000;
|
|
return 1;
|
|
}
|
|
|
|
static int utf8_fill( hb_work_private_t * pv )
|
|
{
|
|
int bytes, conversion = 0;
|
|
size_t out_size;
|
|
|
|
/* Align utf8 data to beginning of the buffer so that we can
|
|
* fill the buffer to its maximum */
|
|
memmove( pv->utf8_buf, pv->utf8_buf + pv->utf8_pos, pv->utf8_end - pv->utf8_pos );
|
|
pv->utf8_end -= pv->utf8_pos;
|
|
pv->utf8_pos = 0;
|
|
out_size = 2048 - pv->utf8_end;
|
|
while( out_size )
|
|
{
|
|
char *p, *q;
|
|
size_t in_size, retval;
|
|
|
|
if( pv->end == pv->pos )
|
|
{
|
|
bytes = fread( pv->buf, 1, 1024, pv->file );
|
|
pv->pos = 0;
|
|
pv->end = bytes;
|
|
if( bytes == 0 )
|
|
{
|
|
return conversion;
|
|
}
|
|
}
|
|
|
|
p = pv->buf + pv->pos;
|
|
q = pv->utf8_buf + pv->utf8_end;
|
|
in_size = pv->end - pv->pos;
|
|
|
|
retval = iconv( pv->iconv_context, &p, &in_size, &q, &out_size);
|
|
if (q != pv->utf8_buf + pv->utf8_end)
|
|
{
|
|
conversion = 1;
|
|
}
|
|
|
|
pv->utf8_end = q - pv->utf8_buf;
|
|
pv->pos = p - pv->buf;
|
|
|
|
if ( !pv->utf8_bom_skipped )
|
|
{
|
|
uint8_t *buf = (uint8_t*)pv->utf8_buf;
|
|
if (buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
|
|
{
|
|
pv->utf8_pos = 3;
|
|
if (pv->utf8_pos == pv->utf8_end)
|
|
{
|
|
// If only the bom was converted, indicate no conversion
|
|
conversion = 0;
|
|
}
|
|
}
|
|
pv->utf8_bom_skipped = 1;
|
|
}
|
|
|
|
if( ( retval == -1 ) && ( errno == EINVAL ) )
|
|
{
|
|
/* Incomplete multibyte sequence, read more data */
|
|
memmove( pv->buf, p, pv->end - pv->pos );
|
|
pv->end -= pv->pos;
|
|
pv->pos = 0;
|
|
bytes = fread( pv->buf + pv->end, 1, 1024 - pv->end, pv->file );
|
|
if( bytes == 0 )
|
|
{
|
|
return conversion;
|
|
}
|
|
pv->end += bytes;
|
|
} else if ( ( retval == -1 ) && ( errno == EILSEQ ) )
|
|
{
|
|
hb_error( "Invalid byte for codeset in input, discard byte" );
|
|
/* Try the next byte of the input */
|
|
pv->pos++;
|
|
} else if ( ( retval == -1 ) && ( errno == E2BIG ) )
|
|
{
|
|
/* buffer full */
|
|
return conversion;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static int get_line( hb_work_private_t * pv, char *buf, int size )
|
|
{
|
|
int i;
|
|
char c;
|
|
|
|
// clear remnants of the previous line before processing a new one
|
|
memset(buf, '\0', size);
|
|
|
|
/* Find newline in converted UTF-8 buffer */
|
|
for( i = 0; i < size - 1; i++ )
|
|
{
|
|
if( pv->utf8_pos >= pv->utf8_end )
|
|
{
|
|
if( !utf8_fill( pv ) )
|
|
{
|
|
if( i )
|
|
return 1;
|
|
else
|
|
return 0;
|
|
}
|
|
}
|
|
c = pv->utf8_buf[pv->utf8_pos++];
|
|
if( c == '\n' )
|
|
{
|
|
buf[i] = '\n';
|
|
buf[i+1] = '\0';
|
|
return 1;
|
|
}
|
|
buf[i] = c;
|
|
}
|
|
buf[0] = '\0';
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Read the SRT file and put the entries into the subtitle fifo for all to read
|
|
*/
|
|
static hb_buffer_t *srt_read( hb_work_private_t *pv )
|
|
{
|
|
char line_buffer[1024];
|
|
int reprocess = 0, resync = 0;
|
|
|
|
if (!pv->file)
|
|
{
|
|
return hb_buffer_eof_init();
|
|
}
|
|
|
|
if (pv->job->reader_pts_offset == AV_NOPTS_VALUE)
|
|
{
|
|
// We need to wait for reader to initialize it's pts offset so that
|
|
// we know where to start reading SRTs.
|
|
return NULL;
|
|
}
|
|
if (pv->start_time == AV_NOPTS_VALUE)
|
|
{
|
|
pv->start_time = pv->job->reader_pts_offset;
|
|
if (pv->job->pts_to_stop > 0)
|
|
{
|
|
pv->stop_time = pv->job->pts_to_start + pv->job->pts_to_stop;
|
|
}
|
|
}
|
|
|
|
while( reprocess || get_line( pv, line_buffer, sizeof( line_buffer ) ) )
|
|
{
|
|
reprocess = 0;
|
|
switch (pv->current_state)
|
|
{
|
|
case k_state_timecode:
|
|
{
|
|
struct start_and_end timing;
|
|
int result;
|
|
|
|
result = read_time_from_string( line_buffer, &timing );
|
|
if (!result)
|
|
{
|
|
resync = 1;
|
|
pv->current_state = k_state_potential_new_entry;
|
|
continue;
|
|
}
|
|
pv->current_entry.duration = timing.end - timing.start;
|
|
pv->current_entry.offset = timing.start - pv->current_time;
|
|
|
|
pv->current_time = timing.end;
|
|
|
|
pv->current_entry.start = timing.start;
|
|
pv->current_entry.stop = timing.end;
|
|
|
|
pv->current_state = k_state_inEntry;
|
|
continue;
|
|
}
|
|
|
|
case k_state_inEntry_or_new:
|
|
{
|
|
char *endpoint;
|
|
/*
|
|
* Is this really new next entry begin?
|
|
* Look for entry number.
|
|
*/
|
|
strtol(line_buffer, &endpoint, 10);
|
|
if (endpoint == line_buffer ||
|
|
(endpoint && *endpoint != '\n' && *endpoint != '\r'))
|
|
{
|
|
/*
|
|
* Doesn't resemble an entry number
|
|
* must still be in an entry
|
|
*/
|
|
if (!resync)
|
|
{
|
|
reprocess = 1;
|
|
pv->current_state = k_state_inEntry;
|
|
}
|
|
continue;
|
|
}
|
|
reprocess = 1;
|
|
pv->current_state = k_state_potential_new_entry;
|
|
break;
|
|
}
|
|
|
|
case k_state_inEntry:
|
|
{
|
|
char *q;
|
|
int size, len;
|
|
|
|
// If the current line is empty, we assume this is the
|
|
// separation between two entries. In case we are wrong,
|
|
// the mistake is corrected in the next state.
|
|
if (strcmp(line_buffer, "\n") == 0 || strcmp(line_buffer, "\r\n") == 0) {
|
|
pv->current_state = k_state_potential_new_entry;
|
|
continue;
|
|
}
|
|
|
|
q = pv->current_entry.text + pv->current_entry.pos;
|
|
len = strlen( line_buffer );
|
|
size = MIN(1024 - pv->current_entry.pos - 1, len );
|
|
memcpy(q, line_buffer, size);
|
|
pv->current_entry.pos += size;
|
|
pv->current_entry.text[pv->current_entry.pos] = '\0';
|
|
break;
|
|
}
|
|
|
|
case k_state_potential_new_entry:
|
|
{
|
|
char *endpoint;
|
|
long entry_number;
|
|
hb_buffer_t *buffer = NULL;
|
|
/*
|
|
* Is this really new next entry begin?
|
|
*/
|
|
entry_number = strtol(line_buffer, &endpoint, 10);
|
|
if (!resync && (*line_buffer == '\n' || *line_buffer == '\r'))
|
|
{
|
|
/*
|
|
* Well.. looks like we are in the wrong mode.. lets add the
|
|
* newline we misinterpreted...
|
|
*/
|
|
strncat(pv->current_entry.text, " ", sizeof(pv->current_entry.text) - strlen(pv->current_entry.text) - 1);
|
|
pv->current_state = k_state_inEntry_or_new;
|
|
continue;
|
|
}
|
|
if (endpoint == line_buffer ||
|
|
(endpoint && *endpoint != '\n' && *endpoint != '\r'))
|
|
{
|
|
/*
|
|
* Well.. looks like we are in the wrong mode.. lets add the
|
|
* line we misinterpreted...
|
|
*/
|
|
if (!resync)
|
|
{
|
|
reprocess = 1;
|
|
pv->current_state = k_state_inEntry;
|
|
}
|
|
continue;
|
|
}
|
|
/*
|
|
* We found the next entry - or a really rare error condition
|
|
*/
|
|
pv->last_entry_number = entry_number;
|
|
resync = 0;
|
|
if (*pv->current_entry.text != '\0')
|
|
{
|
|
long length = 0;
|
|
char *p, *q;
|
|
uint64_t start_time = ( pv->current_entry.start +
|
|
pv->subtitle->config.offset ) * 90;
|
|
uint64_t stop_time = ( pv->current_entry.stop +
|
|
pv->subtitle->config.offset ) * 90;
|
|
|
|
// Drop subtitles that end before the start time
|
|
// or start after the stop time
|
|
if (stop_time <= pv->start_time ||
|
|
start_time >= pv->stop_time)
|
|
{
|
|
hb_deep_log( 3, "Discarding SRT at time start %"PRId64", stop %"PRId64, start_time, stop_time);
|
|
memset( &pv->current_entry, 0, sizeof( srt_entry_t ) );
|
|
++(pv->number_of_entries);
|
|
pv->current_state = k_state_timecode;
|
|
continue;
|
|
}
|
|
if (start_time < pv->start_time)
|
|
{
|
|
start_time = pv->start_time;
|
|
}
|
|
if (stop_time > pv->stop_time)
|
|
{
|
|
stop_time = pv->stop_time;
|
|
}
|
|
|
|
for (q = p = pv->current_entry.text; *p != '\0'; p++)
|
|
{
|
|
if (*p == '\r' || *p == '\n')
|
|
{
|
|
// sanitize newline codes to a single '\n'
|
|
// and delete any newline at the end of the subtitle
|
|
if (*(p + 1) != '\n' && *(p + 1) != '\r' &&
|
|
*(p + 1) != '\0')
|
|
{
|
|
*q = '\n';
|
|
q++;
|
|
length++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*q = *p;
|
|
q++;
|
|
length++;
|
|
}
|
|
}
|
|
*q = '\0';
|
|
|
|
buffer = hb_buffer_init( length + 1 );
|
|
|
|
if( buffer )
|
|
{
|
|
buffer->s.start = start_time - pv->start_time;
|
|
buffer->s.stop = stop_time - pv->start_time;
|
|
|
|
memcpy( buffer->data, pv->current_entry.text, length + 1 );
|
|
}
|
|
}
|
|
memset( &pv->current_entry, 0, sizeof( srt_entry_t ) );
|
|
++(pv->number_of_entries);
|
|
pv->current_state = k_state_timecode;
|
|
if( buffer )
|
|
{
|
|
return buffer;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
hb_buffer_t *buffer = NULL;
|
|
if (*pv->current_entry.text != '\0')
|
|
{
|
|
long length = 0;
|
|
char *p, *q;
|
|
uint64_t start_time = ( pv->current_entry.start +
|
|
pv->subtitle->config.offset ) * 90;
|
|
uint64_t stop_time = ( pv->current_entry.stop +
|
|
pv->subtitle->config.offset ) * 90;
|
|
|
|
if( !( start_time >= pv->start_time && stop_time < pv->stop_time ) )
|
|
{
|
|
hb_deep_log( 3, "Discarding SRT at time start %"PRId64", stop %"PRId64, start_time, stop_time);
|
|
memset( &pv->current_entry, 0, sizeof( srt_entry_t ) );
|
|
return hb_buffer_eof_init();
|
|
}
|
|
|
|
for (q = p = pv->current_entry.text; *p != '\0'; p++)
|
|
{
|
|
if (*p == '\r' || *p == '\n')
|
|
{
|
|
// sanitize newline codes to a single '\n'
|
|
// and delete any newline at the end of the subtitle
|
|
if (*(p + 1) != '\n' && *(p + 1) != '\r' && *(p + 1) != '\0')
|
|
{
|
|
*q = '\n';
|
|
q++;
|
|
length++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*q = *p;
|
|
q++;
|
|
length++;
|
|
}
|
|
}
|
|
*q = '\0';
|
|
|
|
buffer = hb_buffer_init( length + 1 );
|
|
|
|
if( buffer )
|
|
{
|
|
buffer->s.start = start_time - pv->start_time;
|
|
buffer->s.stop = stop_time - pv->start_time;
|
|
|
|
memcpy( buffer->data, pv->current_entry.text, length + 1 );
|
|
}
|
|
}
|
|
memset( &pv->current_entry, 0, sizeof( srt_entry_t ) );
|
|
if( buffer )
|
|
{
|
|
return buffer;
|
|
}
|
|
|
|
return hb_buffer_eof_init();
|
|
}
|
|
|
|
static int decsrtInit( hb_work_object_t * w, hb_job_t * job )
|
|
{
|
|
hb_work_private_t * pv;
|
|
int i;
|
|
hb_chapter_t * chapter;
|
|
|
|
pv = calloc( 1, sizeof( hb_work_private_t ) );
|
|
if (pv == NULL)
|
|
{
|
|
goto fail;
|
|
}
|
|
|
|
// For SRT files, libav will generate an SSA subtitle header for us.
|
|
// We will copy it into subtitle extradata after the decoder is
|
|
// initialized
|
|
|
|
pv->ctx = decavsubInit(w, job);
|
|
if (pv->ctx == NULL)
|
|
{
|
|
goto fail;
|
|
}
|
|
|
|
w->private_data = pv;
|
|
|
|
pv->job = job;
|
|
pv->current_state = k_state_potential_new_entry;
|
|
pv->number_of_entries = 0;
|
|
pv->last_entry_number = 0;
|
|
pv->current_time = 0;
|
|
pv->subtitle = w->subtitle;
|
|
|
|
/*
|
|
* Figure out the start and stop times from the chapters being
|
|
* encoded - drop subtitle not in this range.
|
|
*/
|
|
pv->start_time = 0;
|
|
for( i = 1; i < job->chapter_start; ++i )
|
|
{
|
|
chapter = hb_list_item( job->list_chapter, i - 1 );
|
|
if( chapter )
|
|
{
|
|
pv->start_time += chapter->duration;
|
|
} else {
|
|
hb_error( "Could not locate chapter %d for SRT start time", i );
|
|
}
|
|
}
|
|
pv->stop_time = pv->start_time;
|
|
for( i = job->chapter_start; i <= job->chapter_end; ++i )
|
|
{
|
|
chapter = hb_list_item( job->list_chapter, i - 1 );
|
|
if( chapter )
|
|
{
|
|
pv->stop_time += chapter->duration;
|
|
} else {
|
|
hb_error( "Could not locate chapter %d for SRT start time", i );
|
|
}
|
|
}
|
|
|
|
hb_deep_log(3, "SRT Start time %"PRId64", stop time %"PRId64,
|
|
pv->start_time, pv->stop_time);
|
|
|
|
if (job->pts_to_start != 0)
|
|
{
|
|
pv->start_time = AV_NOPTS_VALUE;
|
|
}
|
|
|
|
pv->iconv_context = iconv_open( "utf-8", pv->subtitle->config.src_codeset );
|
|
if( pv->iconv_context == (iconv_t) -1 )
|
|
{
|
|
hb_error("Could not open the iconv library with those file formats\n");
|
|
goto fail;
|
|
} else {
|
|
memset( &pv->current_entry, 0, sizeof( srt_entry_t ) );
|
|
|
|
pv->file = hb_fopen(w->subtitle->config.src_filename, "r");
|
|
|
|
if( !pv->file )
|
|
{
|
|
hb_error("Could not open the SRT subtitle file '%s'\n",
|
|
w->subtitle->config.src_filename);
|
|
goto fail;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
|
|
fail:
|
|
if (pv != NULL)
|
|
{
|
|
decavsubClose(pv->ctx);
|
|
if (pv->iconv_context != (iconv_t) -1)
|
|
{
|
|
iconv_close(pv->iconv_context);
|
|
}
|
|
if (pv->file != NULL)
|
|
{
|
|
fclose(pv->file);
|
|
}
|
|
free(pv);
|
|
w->private_data = NULL;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static int decsrtWork( hb_work_object_t * w, hb_buffer_t ** buf_in,
|
|
hb_buffer_t ** buf_out )
|
|
{
|
|
hb_work_private_t * pv = w->private_data;
|
|
hb_buffer_t * in;
|
|
int result;
|
|
|
|
in = srt_read( pv );
|
|
if (in == NULL)
|
|
{
|
|
return HB_WORK_OK;
|
|
}
|
|
|
|
result = decavsubWork(pv->ctx, &in, buf_out);
|
|
if (in != NULL)
|
|
{
|
|
hb_buffer_close(&in);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
static void decsrtClose( hb_work_object_t * w )
|
|
{
|
|
hb_work_private_t * pv = w->private_data;
|
|
if (pv != NULL)
|
|
{
|
|
decavsubClose(pv->ctx);
|
|
fclose( pv->file );
|
|
iconv_close(pv->iconv_context);
|
|
free( w->private_data );
|
|
}
|
|
w->private_data = NULL;
|
|
}
|
|
|
|
hb_work_object_t hb_decsrtsub =
|
|
{
|
|
WORK_DECSRTSUB,
|
|
"SRT Subtitle Decoder",
|
|
decsrtInit,
|
|
decsrtWork,
|
|
decsrtClose
|
|
};
|