# aurak/libreoffice-server/main.py

import io
import os
import subprocess
import time
from typing import Optional

from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.responses import FileResponse, RedirectResponse
from PIL import Image  # Pillow library for image processing
from pydantic import BaseModel

# Response models
class ConvertResponse(BaseModel):
    # Pydantic model describing the outcome of a conversion.
    # NOTE(review): no endpoint visible in this file references this model
    # (/convert answers with a FileResponse) — confirm whether it is still used.
    # Field meanings below are inferred from the names; verify against callers.
    pdf_path: str  # presumably the location of the resulting PDF
    converted: bool  # presumably False when the input was passed through as-is
    original: Optional[str] = None  # presumably the original upload's name/path
    file_size: Optional[int] = None  # presumably a size in bytes — TODO confirm
    error: Optional[str] = None  # presumably an error description on failure

class HealthResponse(BaseModel):
    # Response schema of GET /health (used there as ``response_model``).
    status: str  # /health always reports "healthy"
    service: str  # service identifier; /health reports "libreoffice-converter"
    version: str  # service version string; /health reports "1.0.0"
    uptime: float  # time.time() difference since module import, i.e. seconds
# FastAPI application instance. The route decorators further down register
# their handlers on it; the documentation pages are configured to be served
# at /docs and /redoc.
app = FastAPI(
    title="LibreOffice Document Conversion Service",
    description="Convert Word/PPT/Excel/PDF to PDF and support mixed content document processing",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Wall-clock timestamp taken when this module is imported; /health subtracts
# it from the current time to report the service uptime.
start_time = time.time()

@app.get("/", include_in_schema=False)
async def root():
    """Send requests for the bare root URL to the documentation page."""
    docs_redirect = RedirectResponse(url="/docs")
    return docs_redirect

@app.get("/health", response_model=HealthResponse)
async def health():
    """Report that the service is alive, with its identity and uptime.

    The uptime is the number of seconds elapsed since ``start_time`` was
    recorded at module import.
    """
    seconds_up = time.time() - start_time
    payload = {
        "status": "healthy",
        "service": "libreoffice-converter",
        "version": "1.0.0",
        "uptime": seconds_up,
    }
    return HealthResponse(**payload)

# Extensions rendered to PDF in-process with Pillow rather than an external tool.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp')

# Every extension accepted by /convert, in the order listed in the 400 message.
_ALLOWED_EXTENSIONS = (
    '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx',
    '.md', '.txt', '.rtf', '.odt', '.ods', '.odp',
) + _IMAGE_EXTENSIONS

# Upper bound for one external conversion (10 minutes, to allow complex
# Markdown documents). Also interpolated into the 504 message so the reported
# value cannot drift from the enforced one.
_CONVERSION_TIMEOUT_SECONDS = 600


def _image_to_pdf(image_path: str, pdf_path: str) -> None:
    """Write the image at *image_path* as a single-page PDF at *pdf_path*.

    Images with transparency (modes RGBA, LA, and palette images) are
    flattened onto a white background; any other non-RGB mode is converted
    to RGB before saving.
    """
    with Image.open(image_path) as img:
        if img.mode in ('RGBA', 'LA', 'P'):
            if img.mode == 'P':
                # Palette images are expanded to RGBA so an alpha band exists.
                img = img.convert('RGBA')
            background = Image.new('RGB', img.size, (255, 255, 255))
            # The last band is the alpha channel for both RGBA and LA.
            background.paste(img, mask=img.split()[-1])
            img = background
        elif img.mode != 'RGB':
            img = img.convert('RGB')

        img.save(pdf_path, 'PDF', resolution=100.0, save_all=False)


@app.post("/convert")
async def convert(file: UploadFile = File(...)):
    """Convert an uploaded document or image to PDF and return the PDF.

    The upload is stored in the uploads directory and then handled by type:
    PDFs are returned unchanged, images are converted with Pillow, Markdown
    is rendered by the ``/app/md_to_pdf.js`` Node.js script, and every other
    supported format is converted by headless LibreOffice (``soffice``).

    Args:
        file: The uploaded file. Only the final path component of its
            client-supplied name is used, so the upload can never be written
            outside the uploads directory.

    Returns:
        A ``FileResponse`` streaming the resulting PDF.

    Raises:
        HTTPException: 400 if the filename is missing or its extension is not
            supported; 500 if the conversion fails or yields no output file;
            504 if the external converter exceeds the timeout.

    NOTE(review): ``subprocess.run`` blocks the event loop for the duration
    of the conversion. It is deliberately left synchronous here because
    moving it to a worker thread would allow concurrent conversions to share
    the same upload directory; revisit together with per-request work dirs.
    """
    try:
        if not file.filename:
            raise HTTPException(status_code=400, detail="Uploaded file has no filename")

        # The filename is untrusted client input: drop any directory part
        # (both separator styles) to prevent path traversal.
        safe_name = os.path.basename(file.filename.replace('\\', '/'))
        file_ext = os.path.splitext(safe_name)[1].lower()

        # File format validation (this also rejects names that are empty or
        # have no extension once the directory part is stripped).
        if file_ext not in _ALLOWED_EXTENSIONS:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file format: {file_ext}. Supported formats: {', '.join(_ALLOWED_EXTENSIONS)}"
            )

        # Prefer the container path; fall back to a local directory otherwise.
        upload_dir = "/app/uploads" if os.path.exists("/app/uploads") else "./uploads"
        os.makedirs(upload_dir, exist_ok=True)

        # Save uploaded file
        filepath = os.path.join(upload_dir, safe_name)
        content = await file.read()
        with open(filepath, "wb") as buffer:
            buffer.write(content)

        # For PDF files, return directly without conversion
        if file_ext == '.pdf':
            return FileResponse(filepath, filename=safe_name, media_type='application/pdf')

        expected_pdf = os.path.splitext(filepath)[0] + '.pdf'
        download_name = os.path.splitext(safe_name)[0] + '.pdf'

        # Remove any PDF left over from an earlier upload with the same name,
        # so a converter that exits 0 without producing output cannot cause a
        # stale file to be returned.
        try:
            os.remove(expected_pdf)
        except FileNotFoundError:
            pass

        if file_ext in _IMAGE_EXTENSIONS:
            _image_to_pdf(filepath, expected_pdf)

            # Verify PDF generation completed
            if not os.path.exists(expected_pdf):
                raise HTTPException(
                    status_code=500,
                    detail="Image to PDF conversion succeeded but output file not found"
                )
            return FileResponse(expected_pdf, filename=download_name, media_type='application/pdf')

        if file_ext == '.md':
            # Use Node.js script to render Markdown to PDF
            cmd = ['node', '/app/md_to_pdf.js', filepath, expected_pdf]
        else:
            # Conversion using LibreOffice
            cmd = [
                'soffice',
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', upload_dir,
                filepath
            ]

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=_CONVERSION_TIMEOUT_SECONDS,
        )

        # Combine stdout and stderr for logging and error reporting
        combined_output = result.stdout or ""
        if result.stderr:
            combined_output += "\n" + result.stderr

        print(f"Converter output ({cmd[0]}): {combined_output}")

        if result.returncode != 0:
            print(f"Subprocess failed with return code: {result.returncode}")
            raise HTTPException(
                status_code=500,
                detail=f"Conversion failed: {combined_output}"
            )

        # Verify output file
        if not os.path.exists(expected_pdf):
            raise HTTPException(
                status_code=500,
                detail="Conversion succeeded but output file not found"
            )

        return FileResponse(expected_pdf, filename=download_name, media_type='application/pdf')

    except HTTPException:
        raise
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(
            status_code=504,
            detail=f"Conversion timeout ({_CONVERSION_TIMEOUT_SECONDS} seconds)"
        ) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

@app.get("/version")
async def version():
    """Describe the service: its name, version, framework and LibreOffice line."""
    details = dict(
        service="libreoffice-converter",
        version="1.0.0",
        framework="FastAPI",
        libreoffice="7.x",
    )
    return details