跳轉到

MCP Server (ait-mcp)

mcp_server

MCP server exposing AI Translate capabilities to LLM agents.

Provides text/document translation, image text extraction, audio transcription, speech synthesis, glossary queries, and language listing as MCP tools that any compatible client (Claude Desktop, Claude Code, etc.) can invoke.

Usage::

ait-mcp                     # stdio transport (default)
ait-mcp --transport sse     # SSE transport for web clients

_bootstrap

_bootstrap()

Initializes app directories, logging, and the database once.

Source code in src/mcp_server.py
def _bootstrap() -> None:
    """Performs the one-time process setup shared by every tool.

    Creates the app directories, configures logging, initialises the
    database and loads the persisted LLM caches.  Once the module-level
    ``_bootstrapped`` flag is set, further calls return immediately.
    """
    global _bootstrapped  # noqa: PLW0603
    if not _bootstrapped:
        with _bootstrap_lock:
            # Re-check while holding the lock: another caller may have
            # finished the setup while this one was waiting for it.
            if not _bootstrapped:
                from src.core.database import init_db  # noqa: PLC0415
                from src.utils.path_manager import (  # noqa: PLC0415
                    configure_logging,
                    ensure_app_dirs_exist,
                )

                ensure_app_dirs_exist()
                configure_logging()
                init_db()

                # Reload the per-(endpoint, model) variant cache so a call
                # against an already-known reasoning model can skip the
                # variant probe and use the working payload straight away.
                from src.core.llm_engine import (  # noqa: PLC0415
                    _load_persistent_caches,
                )

                _load_persistent_caches()

                # Flip the flag last so a failure above leaves the setup
                # to be retried by the next call.
                _bootstrapped = True

_validate_language

_validate_language(label, param_name)

Validates and resolves a language label case-insensitively.

PARAMETER DESCRIPTION
label

Language name provided by the caller.

TYPE: str

param_name

Parameter name for the error message (e.g. "target language", "source language").

TYPE: str

RETURNS DESCRIPTION
str

The canonical language label from AVAILABLE_LANGUAGES.

RAISES DESCRIPTION
ValueError

If the label does not match any known language.

Source code in src/mcp_server.py
def _validate_language(label: str, param_name: str) -> str:
    """Resolves a caller-supplied language name to its canonical label.

    The lookup is case-insensitive: ``label`` is lower-cased before being
    looked up in ``_LANG_LOWER_MAP``.

    Args:
        label: Language name provided by the caller.
        param_name: How to refer to the parameter in the error message
            (e.g. "target language", "source language").

    Returns:
        The canonical language label stored in ``_LANG_LOWER_MAP``.

    Raises:
        ValueError: If ``label`` does not match any known language.
    """
    canonical = _LANG_LOWER_MAP.get(label.lower())
    if canonical is not None:
        return canonical

    # Echo the caller's original spelling and point at the discovery tool.
    msg = (
        f"Unknown {param_name} '{label}'. "
        "Call list_languages to see supported values."
    )
    raise ValueError(msg)

_validate_source_language

_validate_source_language(label)

Validates an optional source language label.

Returns empty string for auto-detect, or the canonical label.

Source code in src/mcp_server.py
def _validate_source_language(label: str) -> str:
    """Validates a source language label that may be left empty.

    A falsy ``label`` means auto-detect and yields an empty string; any
    other value is resolved to its canonical label by
    ``_validate_language``, which raises ``ValueError`` when it is unknown.
    """
    return _validate_language(label, "source language") if label else ""

_require_llm

_require_llm()

Raises RuntimeError if the LLM backend is not configured.

Source code in src/mcp_server.py
def _require_llm() -> None:
    """Ensures an LLM backend is configured before a tool relies on it.

    Raises:
        RuntimeError: If ``check_llm_setup`` reports that no LLM is set up.
    """
    from src.utils.config_manager import check_llm_setup  # noqa: PLC0415

    if check_llm_setup():
        return

    msg = (
        "LLM is not configured. "
        "Run the desktop app and set up your API key in Settings > LLM."
    )
    raise RuntimeError(msg)

_resolve_content_type

_resolve_content_type(label)

Maps a user-facing content_type string to the internal constant.

Unknown labels fall back to plain text.

Source code in src/mcp_server.py
def _resolve_content_type(label: str) -> str:
    """Translates a user-facing content_type name into the internal constant.

    The alias table is filled on first use.  Lookups are case-insensitive,
    and any label that is not in the table resolves to the plain-text
    constant.
    """
    if not _CONTENT_TYPE_ALIASES:
        with _content_type_lock:
            # Check again under the lock so the table is only filled once.
            if not _CONTENT_TYPE_ALIASES:
                from src.constants.llm import (  # noqa: PLC0415
                    CONTENT_HTML,
                    CONTENT_LOCALIZATION,
                    CONTENT_MARKDOWN,
                    CONTENT_PLAIN_TEXT,
                    CONTENT_RTF,
                    CONTENT_SUBTITLE,
                    CONTENT_XML,
                )

                _CONTENT_TYPE_ALIASES.update(
                    plain_text=CONTENT_PLAIN_TEXT,
                    html=CONTENT_HTML,
                    subtitle=CONTENT_SUBTITLE,
                    markdown=CONTENT_MARKDOWN,
                    xml=CONTENT_XML,
                    rtf=CONTENT_RTF,
                    localization=CONTENT_LOCALIZATION,
                    # "json" has no constant of its own here; it is
                    # handled as plain text.
                    json=CONTENT_PLAIN_TEXT,
                )

    alias_key = label.lower()
    plain_text_fallback = _CONTENT_TYPE_ALIASES["plain_text"]
    return _CONTENT_TYPE_ALIASES.get(alias_key, plain_text_fallback)

translate_text

translate_text(
    texts,
    target_language,
    source_language="",
    content_type="plain_text",
    model="",
)

Translate a list of text strings into the target language.

PARAMETER DESCRIPTION
texts

One or more strings to translate.

TYPE: list[str]

target_language

Target language name (e.g. "French", "Vietnamese"). Use list_languages to see all supported values.

TYPE: str

source_language

Source language name, or empty string for auto-detection (default).

TYPE: str DEFAULT: ''

content_type

Hint about the text format — one of "plain_text", "html", "subtitle", "markdown", "xml", "rtf", "json", "localization". Helps the LLM preserve formatting tags.

TYPE: str DEFAULT: 'plain_text'

model

LLM model to use (e.g. "Gemini:gemini-3-flash-preview"). Defaults to the last model selected in the desktop app.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
list[str]

Translated strings in the same order as the input.

Source code in src/mcp_server.py
@mcp.tool()
def translate_text(  # noqa: PLR0913
    texts: list[str],
    target_language: str,
    source_language: str = "",
    content_type: str = "plain_text",
    model: str = "",
) -> list[str]:
    """Translate a list of text strings into the target language.

    Args:
        texts: One or more strings to translate.
        target_language: Target language name (e.g. "French", "Vietnamese").
            Use list_languages to see all supported values.
        source_language: Source language name, or empty string for
            auto-detection (default).
        content_type: Hint about the text format — one of "plain_text",
            "html", "subtitle", "markdown", "xml", "rtf", "json",
            "localization". Helps the LLM preserve formatting tags.
        model: LLM model to use (e.g. "Gemini:gemini-3-flash-preview").
            Defaults to the last model selected in the desktop app.

    Returns:
        Translated strings in the same order as the input.
    """
    _bootstrap()
    resolved_target = _validate_language(target_language, "target language")
    resolved_source = _validate_source_language(source_language)
    _require_llm()

    from src.utils.config_manager import (  # noqa: PLC0415
        get_available_models,
        parse_model_id,
    )

    # Both stay None unless the caller names a model explicitly.
    provider: str | None = None
    model_name: str | None = None
    if model:
        # Without the ``Provider:model`` separator ``parse_model_id``
        # quietly assumes Gemini.  That is too surprising for a public
        # MCP tool, so such input is refused before parsing.
        if ":" not in model:
            msg = (
                f"model expects 'Provider:model_name' format (got '{model}'). "
                "Available models can be listed via the desktop app's LLM tab."
            )
            raise ValueError(msg)

        provider, model_name = parse_model_id(model)
        known_models = get_available_models()
        if (provider, model_name) not in known_models:
            listing = ", ".join(f"{p}:{m}" for p, m in known_models)
            msg = f"Model '{model}' is not available. Available: {listing}"
            raise ValueError(msg)

    from src.core.llm_engine import translate_text as _translate  # noqa: PLC0415

    internal_content_type = _resolve_content_type(content_type)
    return _translate(
        texts=texts,
        target_lang=resolved_target,
        source_lang=resolved_source,
        content_type=internal_content_type,
        provider=provider,
        model=model_name,
    )

extract_image_text

extract_image_text(image_path)

Extract text from an image using OCR or the configured LLM vision model.

Tries the LLM vision provider first (Gemini / custom endpoint). Falls back to OCR when LLM isn't configured OR when LLM returns empty/whitespace text. LLM errors (auth, quota, network) propagate as-is rather than silently falling back — otherwise misconfiguration would be invisible to the caller.

PARAMETER DESCRIPTION
image_path

Absolute path to an image file (PNG, JPG, BMP, WEBP, TIFF).

TYPE: str

RETURNS DESCRIPTION
dict[str, Any]

A dict with:

  • "text": the full extracted text as a single string.
  • "method": "llm" or "ocr" indicating which backend was used.
  • "blocks": (OCR only) list of dicts with keys "text", "box" [x, y, w, h], and "confidence".
RAISES DESCRIPTION
RuntimeError

if neither LLM nor OCR is configured.

ValueError

if the image format is unsupported.

FileNotFoundError

if the image path doesn't exist.

Source code in src/mcp_server.py
@mcp.tool()
def extract_image_text(image_path: str) -> dict[str, Any]:
    """Extract text from an image using OCR or the configured LLM vision model.

    Tries the LLM vision provider first (Gemini / custom endpoint). Falls
    back to OCR when LLM isn't configured OR when LLM returns empty/whitespace
    text. LLM errors (auth, quota, network) propagate as-is rather than
    silently falling back — otherwise misconfiguration would be invisible
    to the caller.

    If the LLM is configured but finds no text and OCR is not configured,
    an empty LLM result is returned instead of an error: a backend did run,
    the image simply contained no readable text.

    Args:
        image_path: Absolute path to an image file
            (PNG, JPG, BMP, WEBP, TIFF).

    Returns:
        A dict with:
        - "text": the full extracted text as a single string.
        - "method": "llm" or "ocr" indicating which backend was used.
        - "blocks": (OCR only) list of dicts with keys "text", "box"
            [x, y, w, h], and "confidence". Empty list for "llm".

    Raises:
        RuntimeError: if neither LLM nor OCR is configured.
        ValueError: if the image format is unsupported.
        FileNotFoundError: if the image path doesn't exist.
    """
    _bootstrap()

    path = Path(image_path)
    if not path.is_file():
        msg = f"File not found: {image_path}"
        raise FileNotFoundError(msg)

    from src.constants.files import SUPPORTED_IMAGES  # noqa: PLC0415

    if path.suffix.lower() not in SUPPORTED_IMAGES:
        msg = (
            f"Unsupported image format '{path.suffix}'. "
            f"Supported: {', '.join(sorted(SUPPORTED_IMAGES))}"
        )
        raise ValueError(msg)

    # Try LLM vision first
    from src.utils.config_manager import check_llm_setup  # noqa: PLC0415

    # Remember whether the LLM was usable: it decides below whether
    # "nothing configured" is a truthful error message.
    llm_configured = check_llm_setup()
    if llm_configured:
        from src.core.llm_engine import (  # noqa: PLC0415
            extract_image_text as _extract_llm,
        )

        text = _extract_llm(image_path)
        if text.strip():
            return {"text": text, "method": "llm", "blocks": []}

    # Fall back to OCR
    from src.utils.config_manager import check_ocr_setup  # noqa: PLC0415

    if check_ocr_setup():
        from src.constants.ocr import OCR_METHOD_TESSERACT  # noqa: PLC0415
        from src.constants.settings import SETTING_OCR_METHOD  # noqa: PLC0415
        from src.core.ocr_engine import run_ocr  # noqa: PLC0415
        from src.utils.config_manager import load_setting  # noqa: PLC0415

        method = load_setting(SETTING_OCR_METHOD, OCR_METHOD_TESSERACT)
        results = run_ocr(image_path, method=method)
        # Blank fragments are left out of the joined text but are still
        # reported in "blocks".
        full_text = " ".join(r.text for r in results if r.text.strip())
        blocks = [
            {"text": r.text, "box": [r.x, r.y, r.w, r.h], "confidence": r.confidence}
            for r in results
        ]
        return {"text": full_text, "method": "ocr", "blocks": blocks}

    if llm_configured:
        # The LLM ran successfully but found no text, and there is no OCR
        # backend to retry with.  Reporting "neither LLM nor OCR is
        # configured" here would be false and would send the caller off
        # to fix a setup that is not broken.
        return {"text": "", "method": "llm", "blocks": []}

    msg = (
        "Neither LLM nor OCR is configured. "
        "Run the desktop app and set up an API key in Settings."
    )
    raise RuntimeError(msg)

list_languages

list_languages()

List all supported languages for translation.

RETURNS DESCRIPTION
list[dict[str, str]]

A list of dicts, each with:

  • "locale": BCP-47 locale code (e.g. "vi", "zh-CN").
  • "name": English language name (e.g. "Vietnamese").
  • "native_name": Name in the language's own script.
Source code in src/mcp_server.py
@mcp.tool()
def list_languages() -> list[dict[str, str]]:
    """List all supported languages for translation.

    Returns:
        A list of dicts, each with:
        - "locale": BCP-47 locale code (e.g. "vi", "zh-CN").
        - "name": English language name (e.g. "Vietnamese").
        - "native_name": Name in the language's own script.
    """
    catalogue: list[dict[str, str]] = []
    # Each LANGUAGES entry is a 4-tuple; the icon field is not exposed.
    for locale, label, _icon, native in LANGUAGES:
        entry = {"locale": locale, "name": label, "native_name": native}
        catalogue.append(entry)
    return catalogue

_run_pipeline_background

_run_pipeline_background(task_ids, config, cancel_event)

Runs the translation pipeline and cleans up tracking state.

Called as the target of a daemon thread started by translate_document. Catches all exceptions so the thread never crashes silently.

PARAMETER DESCRIPTION
task_ids

Task IDs owned by this pipeline invocation.

TYPE: list[int]

config

TranslationConfig to drive the pipeline.

TYPE: object

cancel_event

Signalled by cancel_task to request cooperative shutdown. Cancellation is checked between tasks and between LLM batches within a task; a mid-batch cancel lets the current batch finish first.

TYPE: Event

Source code in src/mcp_server.py
def _run_pipeline_background(
    task_ids: list[int],
    config: object,
    cancel_event: threading.Event,
) -> None:
    """Drives one translation pipeline run and then untracks its tasks.

    This is the thread target that ``translate_document`` starts as a
    daemon thread.  Every ``Exception`` raised by the pipeline is logged
    with its traceback instead of escaping the thread, and the task IDs
    are always removed from ``_active_pipelines`` when the run ends.

    Args:
        task_ids: Task IDs owned by this pipeline invocation.
        config: TranslationConfig to drive the pipeline.
        cancel_event: Signalled by ``cancel_task`` to request cooperative
            shutdown.  Cancellation is checked between tasks and between
            LLM batches within a task; a mid-batch cancel lets the current
            batch finish first.
    """
    from src.core.translator import run_translation_pipeline  # noqa: PLC0415

    def _no_longer_tracked(task_id: int) -> bool:
        """Reports a task as cancelled once it is gone from tracking."""
        with _pipelines_lock:
            still_tracked = task_id in _active_pipelines
        return not still_tracked

    try:
        run_translation_pipeline(
            config=config,
            # Batch-wide flag: the bound method is polled by the pipeline.
            is_cancelled=cancel_event.is_set,
            # Per-task check against the tracking table.
            task_cancelled=_no_longer_tracked,
            task_ids=task_ids,
        )
    except Exception:
        logger.exception("Pipeline error for tasks %s", task_ids)
    finally:
        # Runs on success and on failure alike so no entry is left behind
        # in the tracking table.
        with _pipelines_lock:
            for finished_id in task_ids:
                _active_pipelines.pop(finished_id, None)

translate_document

translate_document(
    file_paths,
    target_language,
    source_language="",
    output_directory="",
    translate_images=False,
    translate_comments=False,
    translate_shapes=False,
    translate_notes=False,
    translate_sheet_names=False,
    model="",
    ocr_method="",
)

Translate one or more files asynchronously.

Queues translation tasks and starts the pipeline in the background. Use get_task_status to poll for progress and results, and cancel_task to stop a running batch cooperatively.

PARAMETER DESCRIPTION
file_paths

Absolute paths to files to translate. Supported formats include images (.png, .jpg), documents (.docx, .pdf, .pptx), text (.txt, .md, .html, .epub), subtitles (.srt), and localization files (.po, .xliff, .yaml).

TYPE: list[str]

target_language

Target language name (e.g. "French", "Vietnamese"). Use list_languages to see all supported values.

TYPE: str

source_language

Source language name, or empty string for auto-detection (default).

TYPE: str DEFAULT: ''

output_directory

Directory for translated output files. Defaults to the same directory as each source file.

TYPE: str DEFAULT: ''

translate_images

Translate embedded images in Office/PDF documents using OCR (requires OCR to be configured).

TYPE: bool DEFAULT: False

translate_comments

Translate comments in Office documents.

TYPE: bool DEFAULT: False

translate_shapes

Translate shapes and text boxes in documents.

TYPE: bool DEFAULT: False

translate_notes

Translate speaker notes in PowerPoint files.

TYPE: bool DEFAULT: False

translate_sheet_names

Translate sheet names in Excel files.

TYPE: bool DEFAULT: False

model

LLM model to use (e.g. "Gemini:gemini-3-flash-preview"). Defaults to the last model selected in the desktop app.

TYPE: str DEFAULT: ''

ocr_method

OCR engine for translate_images. One of "TesseractOCR" (default), "EasyOCR", or "Google Cloud OCR". Friendly spellings like "tesseract" / "easyocr" / "google cloud" are accepted.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
dict[str, Any]

A dict with:

  • "task_ids": list of integer task IDs for polling via get_task_status.
  • "file_count": number of files queued.
Source code in src/mcp_server.py
@mcp.tool()
def translate_document(  # noqa: PLR0913, PLR0915
    file_paths: list[str],
    target_language: str,
    source_language: str = "",
    output_directory: str = "",
    translate_images: bool = False,
    translate_comments: bool = False,
    translate_shapes: bool = False,
    translate_notes: bool = False,
    translate_sheet_names: bool = False,
    model: str = "",
    ocr_method: str = "",
) -> dict[str, Any]:
    """Translate one or more files asynchronously.

    Queues translation tasks and starts the pipeline in the background.
    Use get_task_status to poll for progress and results, and cancel_task
    to stop a running batch cooperatively.

    Args:
        file_paths: Absolute paths to files to translate. Supported
            formats include images (.png, .jpg), documents (.docx, .pdf,
            .pptx), text (.txt, .md, .html, .epub), subtitles (.srt),
            and localization files (.po, .xliff, .yaml).
        target_language: Target language name (e.g. "French", "Vietnamese").
            Use list_languages to see all supported values.
        source_language: Source language name, or empty string for
            auto-detection (default).
        output_directory: Directory for translated output files. Defaults
            to the same directory as each source file. Created if missing,
            but only after every other argument has been validated.
        translate_images: Translate embedded images in Office/PDF documents
            using OCR (requires OCR to be configured).
        translate_comments: Translate comments in Office documents.
        translate_shapes: Translate shapes and text boxes in documents.
        translate_notes: Translate speaker notes in PowerPoint files.
        translate_sheet_names: Translate sheet names in Excel files.
        model: LLM model to use (e.g. "Gemini:gemini-3-flash-preview").
            Defaults to the last model selected in the desktop app.
        ocr_method: OCR engine for translate_images. One of "TesseractOCR"
            (default), "EasyOCR", or "Google Cloud OCR". Friendly spellings
            like "tesseract" / "easyocr" / "google cloud" are accepted.

    Returns:
        A dict with:
        - "task_ids": list of integer task IDs for polling via get_task_status.
        - "file_count": number of files queued.

    Raises:
        FileNotFoundError: if one of file_paths is not an existing file.
        ValueError: if a language, file format, model, or OCR method is
            not recognised.
        RuntimeError: if the LLM is not configured or the translation
            tasks could not be set up.
    """
    _bootstrap()
    target = _validate_language(target_language, "target language")
    source = _validate_source_language(source_language)
    _require_llm()

    # Validate files
    from src.constants.files import ALL_SUPPORTED_EXTENSIONS  # noqa: PLC0415

    valid_paths: list[str] = []
    for fp in file_paths:
        p = Path(fp).resolve()
        if not p.is_file():
            msg = f"File not found: {fp}"
            raise FileNotFoundError(msg)
        if p.suffix.lower() not in ALL_SUPPORTED_EXTENSIONS:
            msg = f"Unsupported file format '{p.suffix}': {fp}"
            raise ValueError(msg)
        valid_paths.append(str(p))

    # Resolve the output directory once and reuse — the pipeline writes
    # relative to storage_path, so if we only resolved for mkdir() the
    # caller's relative path would later be interpreted against whatever
    # cwd the pipeline thread happens to inherit.  Only the path is
    # computed here; the directory itself is created further down, once
    # the remaining arguments are known to be valid.
    output_dir_path = Path(output_directory).resolve() if output_directory else None

    # Build config
    from src.constants.ocr import (  # noqa: PLC0415
        OCR_METHOD_TESSERACT,
        OCR_METHODS,
        resolve_ocr_method,
    )
    from src.core.config import TranslationConfig  # noqa: PLC0415
    from src.utils.config_manager import (  # noqa: PLC0415
        check_ocr_setup_for_method,
        get_available_models,
        parse_model_id,
    )

    llm_provider, llm_model = "", ""
    if model:
        # ``parse_model_id`` silently defaults to Gemini when the
        # ``Provider:model`` separator is missing — guard at the public
        # tool boundary so callers don't accidentally invoke a backend
        # they didn't intend.
        if ":" not in model:
            msg = (
                f"model expects 'Provider:model_name' format (got '{model}'). "
                "Available models can be listed via the desktop app's LLM tab."
            )
            raise ValueError(msg)
        llm_provider, llm_model = parse_model_id(model)
        available = get_available_models()
        if (llm_provider, llm_model) not in available:
            msg = (
                f"Model '{model}' is not available. "
                f"Available: {', '.join(f'{p}:{m}' for p, m in available)}"
            )
            raise ValueError(msg)

    resolved_ocr = OCR_METHOD_TESSERACT
    if ocr_method:
        # A distinct name keeps this apart from the output-directory path.
        matched_ocr = resolve_ocr_method(ocr_method)
        if matched_ocr is None:
            msg = (
                f"Unknown OCR method '{ocr_method}'. "
                f"Available: {', '.join(OCR_METHODS)}"
            )
            raise ValueError(msg)
        resolved_ocr = matched_ocr

    # Every argument has been validated, so it is now safe to touch the
    # filesystem.  Creating the directory earlier would leave it behind
    # when the call is rejected for a bad model or OCR method.
    resolved_output = ""
    if output_dir_path is not None:
        output_dir_path.mkdir(parents=True, exist_ok=True)
        resolved_output = str(output_dir_path)

    config = TranslationConfig(
        storage_path=resolved_output,
        ocr_method=resolved_ocr,
        translate_doc_images=translate_images,
        translate_doc_comments=translate_comments,
        translate_doc_shapes=translate_shapes,
        translate_doc_notes=translate_notes,
        translate_sheet_names=translate_sheet_names,
        # The OCR setup is only probed when image translation is requested.
        ocr_is_configured=(
            check_ocr_setup_for_method(resolved_ocr) if translate_images else False
        ),
        auto_remove_history=False,
        llm_provider=llm_provider,
        llm_model=llm_model,
    )

    # Clone files and create DB entries
    from src.core.translator import setup_translation_tasks  # noqa: PLC0415

    tasks = setup_translation_tasks(valid_paths, source, target)
    if not tasks:
        msg = "Failed to set up translation tasks."
        raise RuntimeError(msg)

    # The first element of each task tuple is its ID.
    task_ids = [t[0] for t in tasks]

    # Run pipeline in a daemon thread with a per-batch cancel event so
    # cancel_task() can signal only this invocation's pipeline.
    cancel_event = threading.Event()
    thread = threading.Thread(
        target=_run_pipeline_background,
        args=(task_ids, config, cancel_event),
        daemon=True,
    )
    # Register before starting the thread so the tasks are already tracked
    # by the time the pipeline runs its per-task cancellation check.
    with _pipelines_lock:
        for tid in task_ids:
            _active_pipelines[tid] = (thread, cancel_event)
    thread.start()

    return {"task_ids": task_ids, "file_count": len(task_ids)}

get_task_status

get_task_status(task_ids)

Get the current status and progress of translation tasks.

Use this to poll tasks created by translate_document.

PARAMETER DESCRIPTION
task_ids

List of task IDs returned by translate_document.

TYPE: list[int]

RETURNS DESCRIPTION
list[dict[str, Any]]

A list of dicts (one per task ID), each with:

  • "task_id": the integer task ID.
  • "status": one of "Pending", "Translating", "Done", "Failed", "Paused", or null if the entry was auto-removed.
  • "progress": integer 0-100.
  • "file_name": original file name.
  • "source_lang": source language.
  • "target_lang": target language.
  • "error_code": integer error code (0 = no error), or null.
  • "error_message": raw error tag string preserving any :Service suffix (e.g. "AUTH_ERROR:Gemini"), or null when the task hasn't failed. The suffix names the backend whose API key needs attention. Localised human-readable text is the client's responsibility — the raw tag is exposed so service-aware client UIs can extract the suffix or display the tag verbatim.
Source code in src/mcp_server.py
@mcp.tool()
def get_task_status(task_ids: list[int]) -> list[dict[str, Any]]:
    """Get the current status and progress of translation tasks.

    Use this to poll tasks created by translate_document.

    Args:
        task_ids: List of task IDs returned by translate_document.

    Returns:
        A list of dicts (one per task ID), each with:
        - "task_id": the integer task ID.
        - "status": one of "Pending", "Translating", "Done", "Failed",
            "Paused", or null if the entry was auto-removed.
        - "progress": integer 0-100.
        - "file_name": original file name.
        - "source_lang": source language.
        - "target_lang": target language.
        - "error_code": integer error code (0 = no error), or null.
        - "error_message": raw error tag string preserving any
            ``:Service`` suffix (e.g. ``"AUTH_ERROR:Gemini"``),
            or null when the task hasn't failed.  The suffix names
            the backend whose API key needs attention.  Localised
            human-readable text is the client's responsibility —
            the raw tag is exposed so service-aware client UIs can
            extract the suffix or display the tag verbatim.
    """
    _bootstrap()

    from src.core.database import get_history_entry_details  # noqa: PLC0415

    def _to_record(requested_id: int, row: Any) -> dict[str, Any]:
        """Builds the response dict for one requested ID."""
        if row is None:
            # No stored entry: report the "auto-removed" sentinel, i.e.
            # null fields and a progress of 100.
            return {
                "task_id": requested_id,
                "status": None,
                "progress": 100,
                "file_name": None,
                "source_lang": None,
                "target_lang": None,
                "error_code": None,
                "error_message": None,
            }
        return {
            "task_id": row["id"],
            "status": row["status"],
            "progress": row["progress"],
            "file_name": row["file_name"],
            "source_lang": row["source_lang"],
            "target_lang": row["target_lang"],
            "error_code": row["error_code"],
            # Read with .get(): yields None when the key is absent.
            "error_message": row.get("error_message"),
        }

    # One batched lookup for every requested ID, rather than one query
    # per ID.  IDs without an entry are simply missing from the mapping.
    rows_by_id = get_history_entry_details(task_ids)

    # Output follows the order (and any repeats) of the request.
    return [_to_record(tid, rows_by_id.get(tid)) for tid in task_ids]

cancel_task

cancel_task(task_ids)

Request cancellation of translation tasks started by translate_document.

Cancellation is cooperative: the pipeline checks the flag between tasks and between LLM batches, so an in-flight LLM call completes before the pipeline exits. Unknown task IDs are ignored — no error is raised so callers can safely over-request.

PARAMETER DESCRIPTION
task_ids

Task IDs returned by translate_document.

TYPE: list[int]

RETURNS DESCRIPTION
dict[str, Any]

A dict with:

  • "cancelled": list of task IDs for which a cancel was signalled.
  • "unknown": list of task IDs that were not active (already finished, already cancelled, or never queued here).
Source code in src/mcp_server.py
@mcp.tool()
def cancel_task(task_ids: list[int]) -> dict[str, Any]:
    """Request cancellation of translation tasks started by translate_document.

    Cancellation is cooperative: the pipeline checks the flag between tasks
    and between LLM batches, so an in-flight LLM call completes before the
    pipeline exits. Unknown task IDs are ignored — no error is raised so
    callers can safely over-request.

    Args:
        task_ids: Task IDs returned by translate_document.

    Returns:
        A dict with:
        - "cancelled": list of task IDs for which a cancel was signalled.
        - "unknown": list of task IDs that were not active (already finished,
            already cancelled, or never queued here).
    """
    _bootstrap()

    signalled: list[int] = []
    not_active: list[int] = []
    handled_event_ids: set[int] = set()
    pause_targets: set[int] = set()

    with _pipelines_lock:
        for requested_id in task_ids:
            tracked = _active_pipelines.get(requested_id)
            if tracked is None:
                not_active.append(requested_id)
                continue

            _worker, stop_event = tracked
            event_key = id(stop_event)
            # Several task IDs can belong to one pipeline thread and so
            # share one event; signal each event only once.
            if event_key not in handled_event_ids:
                stop_event.set()
                handled_event_ids.add(event_key)
                # Collect every tracked task that shares this event, not
                # only the IDs that were asked for.
                pause_targets |= {
                    sibling_id
                    for sibling_id, (_sibling_worker, sibling_event) in (
                        _active_pipelines.items()
                    )
                    if sibling_event is stop_event
                }
            signalled.append(requested_id)

    # The history update is done after the tracking lock is released.
    if pause_targets:
        from src.core.database import batch_pause_history_entries  # noqa: PLC0415

        batch_pause_history_entries(sorted(pause_targets))

    return {"cancelled": signalled, "unknown": not_active}

transcribe_audio

transcribe_audio(
    file_path, source_language="", stt_method="Whisper", model_size="base"
)

Transcribe an audio or video file to SRT subtitle text.

PARAMETER DESCRIPTION
file_path

Absolute path to an audio or video file. Audio: .mp3, .wav, .m4a, .flac, .ogg, .aac, .wma. Video: .mp4, .webm, .mkv, .avi, .mov, .wmv.

TYPE: str

source_language

Source language name (e.g. "French"), or empty string for auto-detection (default).

TYPE: str DEFAULT: ''

stt_method

Speech-to-text engine — "Whisper" (local, default) or "Google Cloud".

TYPE: str DEFAULT: 'Whisper'

model_size

Whisper model size — "tiny", "base" (default), "small", "medium", or "large". Ignored for Google Cloud.

TYPE: str DEFAULT: 'base'

RETURNS DESCRIPTION
dict[str, str]

A dict with:

  • "srt": the generated subtitle text in SRT format.
  • "method": the STT engine used.
Source code in src/mcp_server.py
@mcp.tool()
def transcribe_audio(
    file_path: str,
    source_language: str = "",
    stt_method: str = "Whisper",
    model_size: str = "base",
) -> dict[str, str]:
    """Turn a local audio or video file into SRT subtitle text.

    Args:
        file_path: Absolute path to the media file to transcribe.
            Audio: .mp3, .wav, .m4a, .flac, .ogg, .aac, .wma.
            Video: .mp4, .webm, .mkv, .avi, .mov, .wmv.
        source_language: Name of the spoken language (e.g. "French").
            Leave empty (the default) to auto-detect.
        stt_method: Speech-to-text engine, matched case-insensitively:
            "Whisper" (local, default) or "Google Cloud".
        model_size: Whisper model size — "tiny", "base" (default),
            "small", "medium", or "large". Ignored for Google Cloud.

    Returns:
        A dict with:
        - "srt": the generated subtitle text in SRT format.
        - "method": the STT engine name exactly as the caller passed it.

    Raises:
        FileNotFoundError: If ``file_path`` does not resolve to a file.
        ValueError: If the file extension is not in ``SUPPORTED_MEDIA``
            or ``stt_method`` is not one of the known engine names.
        RuntimeError: If the speech engine fails; a missing-FFmpeg
            failure is re-raised with a human-readable message.
    """
    _bootstrap()

    media_path = Path(file_path).resolve()
    if not media_path.is_file():
        msg = f"File not found: {file_path}"
        raise FileNotFoundError(msg)

    from src.constants.files import SUPPORTED_MEDIA  # noqa: PLC0415

    extension = media_path.suffix
    if extension.lower() not in SUPPORTED_MEDIA:
        supported_list = ", ".join(sorted(SUPPORTED_MEDIA))
        msg = (
            f"Unsupported audio/video format '{extension}'. "
            f"Supported: {supported_list}"
        )
        raise ValueError(msg)

    spoken_language = _validate_source_language(source_language)

    from src.constants.settings import STT_GOOGLE, STT_WHISPER  # noqa: PLC0415

    # Engine names are matched on their lowercased form.
    engines_by_name = {"whisper": STT_WHISPER, "google cloud": STT_GOOGLE}
    engine = engines_by_name.get(stt_method.lower())
    if engine is None:
        msg = f"Unknown STT method '{stt_method}'. Use 'Whisper' or 'Google Cloud'."
        raise ValueError(msg)

    from src.core.speech_engine import transcribe_audio as _transcribe  # noqa: PLC0415

    try:
        subtitles = _transcribe(
            str(media_path),
            src_lang=spoken_language,
            stt_method=engine,
            model_size=model_size,
        )
    except RuntimeError as exc:
        # Anything other than the FFmpeg sentinel propagates untouched.
        if "FFMPEG_NOT_FOUND" not in str(exc):
            raise
        # Translate the internal engine tag into a message MCP callers
        # can act on without knowing about the sentinel.
        msg = (
            "FFmpeg is required to decode this audio/video file but is "
            "not installed or not on PATH. Install FFmpeg and try again."
        )
        raise RuntimeError(msg) from exc
    return {"srt": subtitles, "method": stt_method}

synthesize_speech

synthesize_speech(
    text,
    target_language,
    output_path="",
    voice_gender="FEMALE",
    tts_method="Edge TTS",
    audio_format=".mp3",
)

Convert text to speech audio.

PARAMETER DESCRIPTION
text

The text to synthesize into speech.

TYPE: str

target_language

Language for the voice (e.g. "French", "Vietnamese"). Use list_languages to see all supported values.

TYPE: str

output_path

Absolute path for the output audio file. If empty, a temp file is created under the system temp directory with a tts_ prefix and the caller is responsible for deleting it.

TYPE: str DEFAULT: ''

voice_gender

Voice gender — "MALE" or "FEMALE" (default).

TYPE: str DEFAULT: 'FEMALE'

tts_method

TTS engine — "Edge TTS" (free, default), "Google Cloud TTS", "ElevenLabs", "Gemini TTS", or "Piper TTS" (offline; requires the per-language voice to be downloaded first via the desktop app's Settings → Voice → Piper panel, otherwise raises PIPER_VOICE_NOT_INSTALLED).

TYPE: str DEFAULT: 'Edge TTS'

audio_format

Output format — ".mp3" (default) or ".wav". The leading dot is optional; any other value raises ValueError.

TYPE: str DEFAULT: '.mp3'

RETURNS DESCRIPTION
dict[str, str]

A dict with:

  • "output_path": absolute path to the generated audio file.
  • "method": the TTS engine used.
Source code in src/mcp_server.py
@mcp.tool()
def synthesize_speech(  # noqa: PLR0913
    text: str,
    target_language: str,
    output_path: str = "",
    voice_gender: str = "FEMALE",
    tts_method: str = "Edge TTS",
    audio_format: str = ".mp3",
) -> dict[str, str]:
    """Convert text to speech audio.

    Args:
        text: The text to synthesize into speech.
        target_language: Language for the voice (e.g. "French", "Vietnamese").
            Use list_languages to see all supported values.
        output_path: Absolute path for the output audio file. If empty, a
            temp file is created under the system temp directory with a
            ``tts_`` prefix and the caller is responsible for deleting it.
            If synthesis fails, that auto-created temp file is removed
            before the error propagates.
        voice_gender: Voice gender — "MALE" or "FEMALE" (default).
        tts_method: TTS engine — "Edge TTS" (free, default),
            "Google Cloud TTS", "ElevenLabs", "Gemini TTS", or
            "Piper TTS" (offline; requires the per-language voice to
            be downloaded first via the desktop app's Settings →
            Voice → Piper panel, otherwise raises
            PIPER_VOICE_NOT_INSTALLED).
        audio_format: Output format — ".mp3" (default) or ".wav". The
            leading dot is optional; any other value raises ValueError.

    Returns:
        A dict with:
        - "output_path": absolute path to the generated audio file.
        - "method": the TTS engine used.

    Raises:
        ValueError: If ``text`` is blank, ``audio_format`` is not
            supported, or ``tts_method`` is not a known engine name.
    """
    _bootstrap()

    if not text.strip():
        msg = "Text cannot be empty."
        raise ValueError(msg)

    target = _validate_language(target_language, "target language")

    # Normalise audio_format up front so the suffix used for the temp file
    # and the value sent to the backend agree.  Strip whitespace and add
    # the leading dot if the caller omitted it (e.g. ``"mp3"`` vs
    # ``".mp3"``); compare lowercased so ``"MP3"`` is accepted too.
    cleaned_format = audio_format.strip()
    normalized_format = (
        cleaned_format if cleaned_format.startswith(".") else f".{cleaned_format}"
    ).lower()
    _supported_formats = (".mp3", ".wav")
    if normalized_format not in _supported_formats:
        msg = (
            f"Unsupported audio_format '{audio_format}'. "
            f"Supported: {', '.join(_supported_formats)}"
        )
        raise ValueError(msg)

    from src.constants.settings import (  # noqa: PLC0415
        VOICE_TTS_EDGE,
        VOICE_TTS_ELEVENLABS,
        VOICE_TTS_GEMINI,
        VOICE_TTS_GOOGLE,
        VOICE_TTS_PIPER,
    )

    method_map = {
        "edge tts": VOICE_TTS_EDGE,
        "elevenlabs": VOICE_TTS_ELEVENLABS,
        "google cloud tts": VOICE_TTS_GOOGLE,
        "gemini tts": VOICE_TTS_GEMINI,
        "piper tts": VOICE_TTS_PIPER,
    }
    resolved_method = method_map.get(tts_method.lower())
    if resolved_method is None:
        msg = (
            f"Unknown TTS method '{tts_method}'. "
            "Use 'Edge TTS', 'Google Cloud TTS', 'ElevenLabs', "
            "'Gemini TTS', or 'Piper TTS'."
        )
        raise ValueError(msg)

    # Generate a temp file when no output path is specified. Caller is
    # responsible for cleanup in that case — but only once synthesis has
    # succeeded and the path has actually been handed back to them.
    created_temp_file = False
    if not output_path:
        import tempfile  # noqa: PLC0415

        with tempfile.NamedTemporaryFile(
            suffix=normalized_format, delete=False, prefix="tts_"
        ) as tmp:
            output_path = tmp.name
        created_temp_file = True

    from src.core.speech_engine import synthesize_speech as _synthesize  # noqa: PLC0415

    try:
        result_path = _synthesize(
            text=text,
            target_lang=target,
            voice_gender=voice_gender.upper(),
            output_path=output_path,
            tts_method=resolved_method,
            audio_format=normalized_format,
        )
    except BaseException:
        # The caller never learns the temp path when synthesis fails, so
        # they cannot delete it; remove our placeholder to avoid leaking
        # files in the temp directory. A caller-supplied path is never
        # touched. Removal is best-effort and must not mask the real error.
        if created_temp_file:
            import contextlib  # noqa: PLC0415
            import os  # noqa: PLC0415

            with contextlib.suppress(OSError):
                os.unlink(output_path)
        raise
    return {"output_path": result_path, "method": tts_method}

query_glossary

query_glossary(set_id=None, active_only=True)

Query glossary sets and their translation term pairs.

Glossaries enforce consistent terminology during translation. When no set_id is given, returns all matching glossary sets with their entry counts. When set_id is given, returns that set's entries.

PARAMETER DESCRIPTION
set_id

If provided, return entries for this specific glossary set. If omitted, return all glossary sets.

TYPE: int | None DEFAULT: None

active_only

When listing sets (no set_id), only return active sets if True (default). Ignored when set_id is provided.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
dict[str, Any]

A dict with either:

  • "sets": list of {"id", "name", "is_active", "entry_count"} (when no set_id)
  • "entries": list of {"id", "source", "target"} (when set_id given)
  • "set_id": the queried set ID (when set_id given)
Source code in src/mcp_server.py
@mcp.tool()
def query_glossary(
    set_id: int | None = None,
    active_only: bool = True,
) -> dict[str, Any]:
    """Look up glossary sets or the term pairs inside one set.

    Glossaries enforce consistent terminology during translation.
    Without a set_id the tool lists glossary sets together with their
    entry counts; with a set_id it returns that set's entries.

    Args:
        set_id: ID of the glossary set whose entries should be returned.
            If omitted, glossary sets are listed instead.
        active_only: When listing sets (no set_id), restrict the result
            to active sets if True (default). Ignored when set_id is
            provided.

    Returns:
        A dict with either:
        - "sets": list of {"id", "name", "is_active", "entry_count"} (when no set_id)
        - "entries": list of {"id", "source", "target"} (when set_id given)
        - "set_id": the queried set ID (when set_id given)
    """
    _bootstrap()

    from src.core.database import (  # noqa: PLC0415
        get_active_glossary_sets,
        get_glossary_entries,
        get_glossary_entry_count,
        get_glossary_sets,
    )

    # Entry lookup for one specific set.
    if set_id is not None:
        term_pairs = [
            {"id": entry_id, "source": source_term, "target": target_term}
            for entry_id, source_term, target_term in get_glossary_entries(set_id)
        ]
        return {"set_id": set_id, "entries": term_pairs}

    # Normalise both listing queries to (id, name, is_active) rows so a
    # single pass can build the response dicts.
    if active_only:
        rows = [
            (glossary_id, title, True)
            for glossary_id, title in get_active_glossary_sets()
        ]
    else:
        rows = [
            (glossary_id, title, bool(flag))
            for glossary_id, title, flag in get_glossary_sets()
        ]

    listing = [
        {
            "id": glossary_id,
            "name": title,
            "is_active": is_active,
            "entry_count": get_glossary_entry_count(glossary_id),
        }
        for glossary_id, title, is_active in rows
    ]
    return {"sets": listing}

main

main()

CLI entry point for the MCP server.

Source code in src/mcp_server.py
def main() -> None:
    """Parse command-line options and start the MCP server.

    Accepts ``--transport`` ("stdio", the default, or "sse") and
    ``--port`` (default 8000), which is only passed on for the SSE
    transport.
    """
    import argparse  # noqa: PLC0415

    cli = argparse.ArgumentParser(
        prog="ait-mcp",
        description="AI Translate MCP server.",
    )
    cli.add_argument(
        "--transport",
        choices=["stdio", "sse"],
        default="stdio",
        help="MCP transport (default: stdio).",
    )
    cli.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port for SSE transport (default: 8000).",
    )
    options = cli.parse_args()

    # stdio takes no port; handle it first and leave SSE as the fall-through.
    if options.transport != "sse":
        mcp.run(transport="stdio")
        return

    mcp.run(transport="sse", port=options.port)