`subtitle_utils`¶

subtitle_utils ¶

Subtitle file parsing and serialization utilities.

Supports SRT, VTT (WebVTT), ASS, and SSA formats. Each format has a parse/serialize pair. The unified parse_subtitle / serialize_subtitle dispatchers select the correct pair based on file extension.

SubtitleEntry `dataclass` ¶

SubtitleEntry(index, start, end, text, raw_text='', metadata=dict())

A single subtitle cue with timing metadata and translatable text.

ATTRIBUTE	DESCRIPTION
`index`	Sequential position (0-based). TYPE: `int`
`start`	Start timestamp as the original raw string. TYPE: `str`
`end`	End timestamp as the original raw string. TYPE: `str`
`text`	Translatable text (override tags stripped for ASS/SSA). TYPE: `str`
`raw_text`	Original text before tag stripping (ASS/SSA only). TYPE: `str`
`metadata`	Format-specific extra data (cue id, cue settings, etc.). TYPE: `dict[str, str]`

mirror_ass_alignment_for_rtl ¶

mirror_ass_alignment_for_rtl(ass_text)

Mirrors ASS/SSA alignment codes left↔right for an RTL target.

Flips both the per-line override tags (\an1 ↔ \an3 etc.) and the V4+ Style table's Alignment column. Centre alignments (\an2/5/8, legacy \a2/6/10) are untouched.

The function is a string-level rewrite — it doesn't validate ASS structure, so an unrelated Style: row outside [V4+ Styles] won't be touched (the column count would be wrong) but a malformed file won't crash either.

Source code in src/utils/subtitle_utils.py

def mirror_ass_alignment_for_rtl(ass_text: str) -> str:
    r"""Mirrors ASS/SSA alignment codes left↔right for an RTL target.

    Two rewrites are applied line by line:

    * per-line override tags (``\an<digit>`` and legacy ``\a<digit>``)
      are mapped through the module's mirror tables;
    * on rows starting with ``Style:`` (case-insensitive, leading
      whitespace allowed) the column at ``_ASS_STYLE_ALIGNMENT_INDEX``
      is mapped through the ``\an`` mirror table.

    Values with no entry in the mirror tables (the centre alignments
    are expected to be among them) are left unchanged.

    The function is a string-level rewrite — it doesn't validate ASS
    structure.  A ``Style:`` row with too few columns, or whose
    alignment column is not a known code, is passed through untouched,
    so a malformed file won't crash.

    Args:
        ass_text: Complete ASS/SSA document text.

    Returns:
        The document with alignment codes mirrored.  Line endings,
        indentation and the ``Style`` keyword's original casing are
        preserved exactly.
    """
    def _flip_an(match: re.Match[str]) -> str:
        digit = match.group(1)
        return r"\an" + _ASS_AN_MIRROR.get(digit, digit)

    def _flip_legacy(match: re.Match[str]) -> str:
        digit = match.group(1)
        return r"\a" + _ASS_LEGACY_MIRROR.get(digit, digit)

    out_lines: list[str] = []
    # keepends=True so the original line terminators survive the round trip.
    for line in ass_text.splitlines(keepends=True):
        # Override-tag rewrite applies to any line that may carry tags.
        new_line = _ASS_ALIGN_AN_RE.sub(_flip_an, line)
        new_line = _ASS_ALIGN_LEGACY_RE.sub(_flip_legacy, new_line)
        # Style row: rewrite the Alignment column only when the row has
        # enough columns and the cell holds a code we know how to mirror.
        if new_line.lstrip().lower().startswith("style:"):
            # Leading whitespace and the keyword contain no colon, so the
            # first ":" is the keyword's own.  Keeping ``head`` verbatim
            # preserves tab indentation and the keyword's casing instead
            # of normalising them to spaces / "Style".
            head, _, after_kw = new_line.partition(":")
            cols = after_kw.split(",")
            if len(cols) > _ASS_STYLE_ALIGNMENT_INDEX:
                cell = cols[_ASS_STYLE_ALIGNMENT_INDEX]
                code = cell.strip()
                mirrored = _ASS_AN_MIRROR.get(code)
                if mirrored is not None:
                    # Replace only the code itself so padding (and a
                    # trailing newline, if this is the last column) stays.
                    cols[_ASS_STYLE_ALIGNMENT_INDEX] = cell.replace(
                        code, mirrored, 1,
                    )
                    new_line = head + ":" + ",".join(cols)
        out_lines.append(new_line)
    return "".join(out_lines)

is_subtitle_format ¶

is_subtitle_format(suffix)

Returns True if suffix is a supported subtitle extension.

Source code in src/utils/subtitle_utils.py

def is_subtitle_format(suffix: str) -> bool:
    """Tells whether *suffix* names one of the supported subtitle formats.

    The check is a plain membership test against the module's set of
    subtitle extensions; *suffix* is compared exactly as given.
    """
    supported = suffix in _SUBTITLE_EXTENSIONS
    return supported

parse_srt ¶

parse_srt(content)

Parses an SRT file into subtitle entries.

PARAMETER	DESCRIPTION
`content`	Raw SRT file content. TYPE: `str`

RETURNS	DESCRIPTION
`tuple[list[SubtitleEntry], None]`	Tuple of (entries, None). The second element is always `None` because SRT needs no extra data for serialization.

Source code in src/utils/subtitle_utils.py

def parse_srt(content: str) -> tuple[list[SubtitleEntry], None]:
    """Turns raw SRT text into a list of subtitle entries.

    The text is BOM-stripped, line endings are normalized to ``\\n`` and
    the result is split into blocks on runs of blank lines.  A block
    becomes an entry when it has at least two lines, contains a line
    matching the timestamp pattern, that line splits into exactly two
    parts around ``-->``, and the lines after it hold non-blank text.
    Anything else is skipped silently.

    Args:
        content: Raw SRT file content.

    Returns:
        Tuple of (entries, None).  Entries are indexed from 0 in file
        order; the second element is always ``None`` because SRT needs
        no extra data for serialization.
    """
    normalized = strip_bom(content).replace("\r\n", "\n").replace("\r", "\n")
    # Trim only surrounding newlines so whitespace inside cues survives.
    normalized = normalized.strip("\n")

    cues: list[SubtitleEntry] = []
    for chunk in re.split(r"\n\n+", normalized):
        rows = chunk.strip("\n").splitlines()
        if len(rows) < 2:  # noqa: PLR2004
            continue

        # The timing line is usually row 0 or 1 (after the sequence number).
        timing_at = next(
            (pos for pos, row in enumerate(rows) if _TIMESTAMP_RE.search(row)),
            None,
        )
        if timing_at is None:
            continue  # no timing line, so not a cue

        stamps = rows[timing_at].split("-->")
        if len(stamps) != 2:  # noqa: PLR2004
            continue

        body = "\n".join(rows[timing_at + 1 :])
        if not body.strip():
            continue

        begin, finish = (stamp.strip() for stamp in stamps)
        # Skipped blocks never consume an index, so len(cues) is the next one.
        cues.append(
            SubtitleEntry(
                index=len(cues),
                start=begin,
                end=finish,
                text=body,
            ),
        )

    return cues, None

serialize_srt ¶

serialize_srt(entries, _format_data=None)

Reconstructs an SRT file from subtitle entries.

PARAMETER	DESCRIPTION
`entries`	Subtitle entries with (possibly translated) text. TYPE: `list[SubtitleEntry]`
`_format_data`	Unused — present for dispatcher signature consistency. TYPE: `None` DEFAULT: `None`

RETURNS	DESCRIPTION
`str`	Complete SRT file content.

Source code in src/utils/subtitle_utils.py

def serialize_srt(entries: list[SubtitleEntry], _format_data: None = None) -> str:
    """Builds SRT file content from subtitle entries.

    Cues are renumbered sequentially from 1 in the order given; each
    cue is written as number, ``start --> end`` line, text, and cues
    are separated by one blank line.

    Args:
        entries: Subtitle entries with (possibly translated) text.
        _format_data: Ignored; accepted so all serializers share a signature.

    Returns:
        Complete SRT file content (an empty string for no entries).
    """
    return "\n".join(
        f"{number}\n{cue.start} --> {cue.end}\n{cue.text}\n"
        for number, cue in enumerate(entries, start=1)
    )

_is_vtt_header_block ¶

_is_vtt_header_block(text)

Returns True if text is a VTT header/meta block (WEBVTT, NOTE, STYLE).

Source code in src/utils/subtitle_utils.py

def _is_vtt_header_block(text: str) -> bool:
    """Returns True if *text* is a VTT header/meta block (WEBVTT, NOTE, STYLE)."""
    return (
        text.startswith("WEBVTT") or text.startswith("NOTE") or text.startswith("STYLE")
    )

parse_vtt ¶

parse_vtt(content)

Parses a WebVTT file into subtitle entries.

Preserves the WEBVTT header, NOTE comments, and STYLE blocks in header so they can be restored during serialization.

PARAMETER	DESCRIPTION
`content`	Raw VTT file content. TYPE: `str`

RETURNS	DESCRIPTION
`tuple[list[SubtitleEntry], str]`	Tuple of (entries, header). header includes everything before the first cue (WEBVTT line, NOTEs, STYLEs).

Source code in src/utils/subtitle_utils.py

def parse_vtt(content: str) -> tuple[list[SubtitleEntry], str]:
    """Turns raw WebVTT text into subtitle entries plus a header string.

    The text is BOM-stripped, line endings are normalized to ``\\n`` and
    the result is split into blocks on runs of blank lines.  Blocks
    recognised as WEBVTT/NOTE/STYLE, and any other block without a
    timestamp line, are collected (wherever they occur in the file) into
    the header.  Remaining blocks are cues: the first line is kept as the
    cue identifier when the timestamp is not on the first line, and
    anything after the end timestamp on the timing line is kept as cue
    settings.  Cues with a malformed timing line or blank text are
    dropped.

    Args:
        content: Raw VTT file content.

    Returns:
        Tuple of (entries, header).  *header* is the collected non-cue
        blocks joined by blank lines, for use during serialization.
        Cue identifier and settings, when present, are stored in each
        entry's ``metadata`` under ``cue_id`` and ``cue_settings``.
    """
    normalized = strip_bom(content).replace("\r\n", "\n").replace("\r", "\n")

    preamble: list[str] = []
    cues: list[SubtitleEntry] = []

    for raw_block in re.split(r"\n\n+", normalized.strip()):
        block = raw_block.strip()

        # Header / comment / style blocks are kept verbatim.
        if _is_vtt_header_block(block):
            preamble.append(block)
            continue

        block_lines = block.splitlines()
        if not block_lines:
            continue

        timing_at = next(
            (
                pos
                for pos, candidate in enumerate(block_lines)
                if _TIMESTAMP_RE.search(candidate)
            ),
            None,
        )
        if timing_at is None:
            # No timing line: not a cue, so keep it with the header material.
            preamble.append(block)
            continue

        halves = block_lines[timing_at].split("-->")
        if len(halves) != 2:  # noqa: PLR2004
            continue

        body = "\n".join(block_lines[timing_at + 1 :])
        if not body.strip():
            continue

        # Right-hand side: "<end timestamp> [cue settings...]".
        tail = halves[1].strip().split(None, 1)
        end_stamp = tail[0] if tail else ""
        settings = tail[1] if len(tail) > 1 else ""

        # A cue identifier exists only when something precedes the timing line.
        identifier = block_lines[0].strip() if timing_at > 0 else ""

        extras: dict[str, str] = {
            key: value
            for key, value in (("cue_id", identifier), ("cue_settings", settings))
            if value
        }

        # Skipped blocks never consume an index, so len(cues) is the next one.
        cues.append(
            SubtitleEntry(
                index=len(cues),
                start=halves[0].strip(),
                end=end_stamp,
                text=body,
                metadata=extras,
            ),
        )

    return cues, "\n\n".join(preamble)

serialize_vtt ¶

serialize_vtt(entries, header='')

Reconstructs a WebVTT file from entries and the original header.

PARAMETER	DESCRIPTION
`entries`	Subtitle entries with (possibly translated) text. TYPE: `list[SubtitleEntry]`
`header`	Original WEBVTT header block (with NOTEs/STYLEs). TYPE: `str` DEFAULT: `''`

RETURNS	DESCRIPTION
`str`	Complete VTT file content.

Source code in src/utils/subtitle_utils.py

def serialize_vtt(
    entries: list[SubtitleEntry],
    header: str = "",
) -> str:
    """Builds WebVTT file content from entries and a saved header.

    The header (when non-empty) comes first, followed by one block per
    entry; blocks are separated by a blank line and the output ends with
    a single newline.  An entry's ``cue_id`` metadata, if set, is written
    on its own line above the timing line, and ``cue_settings``, if set,
    is appended to the timing line after a space.

    Args:
        entries: Subtitle entries with (possibly translated) text.
        header: Original WEBVTT header block (with NOTEs/STYLEs).

    Returns:
        Complete VTT file content.
    """
    chunks: list[str] = [header] if header else []

    for cue in entries:
        extras = cue.metadata
        timing = f"{cue.start} --> {cue.end}"

        settings = extras.get("cue_settings", "")
        if settings:
            timing = f"{timing} {settings}"

        identifier = extras.get("cue_id", "")
        lead = f"{identifier}\n" if identifier else ""
        chunks.append(f"{lead}{timing}\n{cue.text}")

    return "\n\n".join(chunks) + "\n"

_strip_ass_tags ¶

_strip_ass_tags(text)

Strips ASS/SSA override tags, preserving visible text.

Tags like {\b1}, {\i1}, {\pos(320,240)} are removed.

PARAMETER	DESCRIPTION
`text`	Raw ASS dialogue text. TYPE: `str`

RETURNS	DESCRIPTION
`str`	Text with override tags removed.

Source code in src/utils/subtitle_utils.py

def _strip_ass_tags(text: str) -> str:
    r"""Removes ASS/SSA override tags and keeps the visible text.

    Every match of the module's override-tag pattern — blocks such as
    ``{\b1}``, ``{\i1}`` or ``{\pos(320,240)}`` — is deleted; everything
    else is returned unchanged.

    Args:
        text: Raw ASS dialogue text.

    Returns:
        Text with override tags removed.
    """
    without_tags = _ASS_OVERRIDE_TAG_RE.sub("", text)
    return without_tags

_restore_ass_tags ¶

_restore_ass_tags(original, translated)

Restores leading ASS override tags from original onto translated.

Mid-text tags cannot be reliably repositioned after translation, so only contiguous leading tags are restored.

PARAMETER	DESCRIPTION
`original`	Original text with override tags. TYPE: `str`
`translated`	Translated text without tags. TYPE: `str`

RETURNS	DESCRIPTION
`str`	Translated text prefixed with the original's leading tags.

Source code in src/utils/subtitle_utils.py

def _restore_ass_tags(original: str, translated: str) -> str:
    """Copies the leading ASS override tags of *original* onto *translated*.

    Only the unbroken run of tags at the very start of *original* is
    carried over; tags that appear after visible text cannot be placed
    reliably in translated text and are not restored.

    Args:
        original: Original text with override tags.
        translated: Translated text without tags.

    Returns:
        *translated* prefixed with the original's leading tags, or
        *translated* unchanged when *original* starts with no tag.
    """
    # Walk forward tag by tag; ``cursor`` ends just past the last leading tag.
    cursor = 0
    while cursor < len(original):
        hit = _ASS_OVERRIDE_TAG_RE.match(original, cursor)
        if hit is None:
            break
        cursor = hit.end()

    # Each match starts exactly where the previous one ended, so the
    # concatenated tags are simply the prefix of ``original``.
    return original[:cursor] + translated

parse_ass ¶

parse_ass(content)

Parses an ASS/SSA file into subtitle entries.

Only Dialogue: lines in the [Events] section are treated as translatable. All other content (sections, comments, styles) is preserved verbatim in preserved_lines for later serialization.

PARAMETER	DESCRIPTION
`content`	Raw ASS/SSA file content. TYPE: `str`

RETURNS	DESCRIPTION
`tuple[list[SubtitleEntry], list[str]]`	Tuple of (entries, preserved_lines). Dialogue text positions in preserved_lines are replaced with `__SUB_N__` placeholders where N is the entry index.

Source code in src/utils/subtitle_utils.py

def parse_ass(content: str) -> tuple[list[SubtitleEntry], list[str]]:
    """Turns raw ASS/SSA text into subtitle entries plus a line template.

    Lines are scanned in order.  Section headers (``[...]``) switch the
    ``[Events]`` state on or off.  Inside ``[Events]``, a ``Format:`` line
    determines which column holds ``Text``; each later ``Dialogue:`` line
    with enough columns becomes an entry, and its text is swapped for a
    ``__SUB_N__`` placeholder in the template.  Every other line is kept
    as-is.

    Args:
        content: Raw ASS/SSA file content.

    Returns:
        Tuple of (entries, preserved_lines).  For each entry, ``start``
        and ``end`` are the second and third comma-separated fields of
        the dialogue line, ``raw_text`` is the text column as written
        (stripped of surrounding whitespace) and ``text`` is the same
        with override tags removed.  *N* in each placeholder is the
        entry index.
    """
    kept: list[str] = []
    cues: list[SubtitleEntry] = []
    inside_events = False
    text_col = -1  # unknown until a Format line has been seen in [Events]

    for raw in strip_bom(content).splitlines():
        trimmed = raw.strip()

        # Section header: update state and keep the line.
        if trimmed.startswith("[") and trimmed.endswith("]"):
            inside_events = trimmed.lower() == "[events]"
            kept.append(raw)
            continue

        # Format line inside [Events]: locate the Text column.
        if inside_events and trimmed.lower().startswith("format:"):
            names = [
                name.strip().lower() for name in trimmed.split(":", 1)[1].split(",")
            ]
            text_col = names.index("text") if "text" in names else -1
            kept.append(raw)
            continue

        is_dialogue = (
            inside_events and text_col >= 0 and trimmed.startswith("Dialogue:")
        )
        # Limit the split so commas inside the Text column stay intact.
        columns = trimmed.split(":", 1)[1].split(",", text_col) if is_dialogue else []
        if not is_dialogue or len(columns) <= text_col:
            # Not a usable dialogue line: keep it verbatim.
            kept.append(raw)
            continue

        fixed = columns[:text_col]
        original_text = columns[text_col].strip()
        slot = len(cues)  # entries are appended in order, so this is the next index

        head = ",".join(fixed)
        kept.append(f"Dialogue:{head},__SUB_{slot}__")

        # Standard ASS Format: Layer, Start, End, Style, Name,
        # MarginL, MarginR, MarginV, Effect, Text
        begin = fixed[1].strip() if len(fixed) > 1 else ""
        finish = fixed[2].strip() if len(fixed) > 2 else ""  # noqa: PLR2004

        cues.append(
            SubtitleEntry(
                index=slot,
                start=begin,
                end=finish,
                text=_strip_ass_tags(original_text),
                raw_text=original_text,
            ),
        )

    return cues, kept

serialize_ass ¶

serialize_ass(entries, preserved_lines)

Reconstructs an ASS/SSA file by injecting translated text.

Replaces __SUB_N__ placeholders in preserved_lines with the translated text for each entry, restoring any leading override tags from the original.

PARAMETER	DESCRIPTION
`entries`	Subtitle entries with translated text. TYPE: `list[SubtitleEntry]`
`preserved_lines`	Lines with placeholders from `parse_ass()`. TYPE: `list[str]`

RETURNS	DESCRIPTION
`str`	Complete ASS/SSA file content.

Source code in src/utils/subtitle_utils.py

def serialize_ass(
    entries: list[SubtitleEntry],
    preserved_lines: list[str],
) -> str:
    """Reconstructs an ASS/SSA file by injecting translated text.

    Replaces ``__SUB_N__`` placeholders in *preserved_lines* with the
    translated text for each entry, restoring any leading override tags
    from the original.

    Each line is rewritten in a single regex pass with a dictionary
    lookup per placeholder.  This keeps the work linear in the size of
    the file (rather than lines × entries) and guarantees that inserted
    text is never re-scanned, so a translation that happens to contain a
    ``__SUB_k__``-looking token is emitted literally.

    Args:
        entries: Subtitle entries with translated text.
        preserved_lines: Lines with placeholders from ``parse_ass()``.

    Returns:
        Complete ASS/SSA file content: the lines joined with ``\\n`` and
        terminated by a final newline.  Placeholders with no matching
        entry are left in place.
    """
    # Lookup: placeholder → translated text with leading tags restored.
    replacements: dict[str, str] = {
        f"__SUB_{entry.index}__": _restore_ass_tags(entry.raw_text, entry.text)
        for entry in entries
    }

    placeholder_re = re.compile(r"__SUB_-?\d+__")

    def _fill(match: re.Match[str]) -> str:
        token = match.group(0)
        # A callable replacement is inserted literally, so backslashes in
        # ASS text (e.g. \N line breaks) are not treated as regex escapes.
        return replacements.get(token, token)

    result_lines = [placeholder_re.sub(_fill, line) for line in preserved_lines]
    return "\n".join(result_lines) + "\n"

parse_subtitle ¶

parse_subtitle(content, suffix)

Dispatches to the format-specific subtitle parser.

PARAMETER	DESCRIPTION
`content`	Raw file content. TYPE: `str`
`suffix`	Lowercase file extension (e.g. `".srt"`). TYPE: `str`

RETURNS	DESCRIPTION
`tuple[list[SubtitleEntry], object]`	Tuple of (entries, format_data) where format_data is whatever the format-specific serializer needs.

RAISES	DESCRIPTION
`ValueError`	If the extension is not a supported subtitle format.

Source code in src/utils/subtitle_utils.py

def parse_subtitle(
    content: str,
    suffix: str,
) -> tuple[list[SubtitleEntry], object]:
    """Picks the parser matching *suffix* and runs it on *content*.

    ``.srt`` goes to ``parse_srt``, ``.vtt`` to ``parse_vtt`` and both
    ``.ass`` and ``.ssa`` to ``parse_ass``.  The comparison is exact.

    Args:
        content: Raw file content.
        suffix: Lowercase file extension (e.g. ``".srt"``).

    Returns:
        Tuple of (entries, format_data) where *format_data* is
        whatever the format-specific serializer needs.

    Raises:
        ValueError: If the extension is not a supported subtitle format.
    """
    if suffix == ".srt":
        parser = parse_srt
    elif suffix == ".vtt":
        parser = parse_vtt
    elif suffix in (".ass", ".ssa"):
        parser = parse_ass
    else:
        raise ValueError(f"Unsupported subtitle format: {suffix}")
    return parser(content)

serialize_subtitle ¶

serialize_subtitle(entries, format_data, suffix)

Dispatches to the format-specific subtitle serializer.

PARAMETER	DESCRIPTION
`entries`	Subtitle entries with translated text. TYPE: `list[SubtitleEntry]`
`format_data`	Format-specific data from `parse_subtitle()`. TYPE: `object`
`suffix`	Lowercase file extension. TYPE: `str`

RETURNS	DESCRIPTION
`str`	Complete file content.

RAISES	DESCRIPTION
`ValueError`	If the extension is not a supported subtitle format.

Source code in src/utils/subtitle_utils.py

def serialize_subtitle(
    entries: list[SubtitleEntry],
    format_data: object,
    suffix: str,
) -> str:
    """Picks the serializer matching *suffix* and runs it.

    ``.srt`` goes to ``serialize_srt``, ``.vtt`` to ``serialize_vtt`` and
    both ``.ass`` and ``.ssa`` to ``serialize_ass``; *format_data* is
    passed through as the serializer's second argument.

    Args:
        entries: Subtitle entries with translated text.
        format_data: Format-specific data from ``parse_subtitle()``.
        suffix: Lowercase file extension.

    Returns:
        Complete file content.

    Raises:
        ValueError: If the extension is not a supported subtitle format.
    """
    if suffix == ".srt":
        writer = serialize_srt
    elif suffix == ".vtt":
        writer = serialize_vtt
    elif suffix in (".ass", ".ssa"):
        writer = serialize_ass
    else:
        raise ValueError(f"Unsupported subtitle format: {suffix}")
    return writer(entries, format_data)

subtitle_utils¶

subtitle_utils ¶

SubtitleEntry dataclass ¶

mirror_ass_alignment_for_rtl ¶

is_subtitle_format ¶

parse_srt ¶

serialize_srt ¶

_is_vtt_header_block ¶

parse_vtt ¶

serialize_vtt ¶

_strip_ass_tags ¶

_restore_ass_tags ¶

parse_ass ¶

serialize_ass ¶

parse_subtitle ¶

serialize_subtitle ¶

`subtitle_utils`¶

SubtitleEntry `dataclass` ¶