// src/extractors/PDFExtractor.ts
import pdfjsLib from '../utils/pdfInit';
import { BaseDocumentExtractor, ExtractionOptions, ExtractionResult } from './BaseDocumentExtractor';
import { PDFDocumentProxy } from 'pdfjs-dist';

// Define proper interfaces for PDF.js types
/**
 * Shape of the object resolved by a PDF.js page's `getTextContent()` call,
 * restricted to the fields this module uses.
 */
interface PDFTextContent {
    /** Text fragments of the page, in the order PDF.js reports them. */
    items: PDFTextItem[];
    /** Style information reported alongside the items; not read by this module, hence `unknown`. */
    styles?: unknown;
}

/**
 * A single text fragment as reported by PDF.js.
 */
interface PDFTextItem {
    /** The fragment's text. */
    str: string;
    /** Text direction reported by PDF.js. */
    dir: string;
    /** Horizontal extent of the fragment; used to measure gaps between neighbouring fragments. */
    width: number;
    /** Vertical extent of the fragment; used by this module as a font-size estimate. */
    height: number;
    /** Transform array; this module reads index 4 as the x position and index 5 as the y position. */
    transform: number[];
    /** Name of the font used for the fragment. */
    fontName: string;
    /** Whether PDF.js flagged an end-of-line after this fragment. */
    hasEOL: boolean;
}

/**
 * Subset of the object resolved by the PDF.js document's `getMetadata()`
 * call that this module reads.
 */
interface PDFMetadata {
    /** Entries of the PDF's document information dictionary; every entry is optional. */
    info: {
        Title?: string;
        Author?: string;
        Subject?: string;
        Keywords?: string;
    };
    /** Additional metadata object; not read by this module, hence `unknown`. */
    metadata?: unknown;
}

/**
 * Options accepted by PDFExtractor on top of the base ExtractionOptions.
 * Every field is optional; the PDFExtractor constructor fills in a default
 * for any field the caller leaves unset.
 */
export interface PDFExtractionOptions extends ExtractionOptions {
    /** Emit heading elements for blocks that look like headings. Defaults to true. */
    detectHeaders?: boolean;
    /** Emit list markup for blocks that start with a bullet, "1." or "a)" marker. Defaults to true. */
    detectLists?: boolean;
    /** Toggle intended for table detection. Defaults to true. */
    detectTables?: boolean;
    /** Minimum number of consecutive table-like rows needed to emit a `<table>`; shorter runs become paragraphs. Defaults to 2. */
    minTableRows?: number;
}

/**
 * A group of PDF text items that the extractor treats as one visual line.
 */
interface TextBlock {
    /** The PDF text items that were grouped into this block. */
    items: PDFTextItem[];
    /** The items' `str` values joined with single spaces. */
    text: string;
    /** Font-size estimate for the block, taken from an item's `height`. */
    fontSize: number;
    /** Vertical position (`transform[5]`) of the item that started the block. */
    y: number;
}

/**
 * Extracts the text of a PDF file with PDF.js and converts it to simple HTML.
 *
 * Text items of each page are grouped into lines ("blocks") by vertical
 * position; each block is then emitted as a heading, a list item, a table row
 * or a paragraph according to lightweight layout heuristics that can be
 * toggled through {@link PDFExtractionOptions}.
 */
export class PDFExtractor extends BaseDocumentExtractor {
    /** Largest allowed deviation of a gap from the row's mean gap for a line to count as a table row. */
    private static readonly GAP_TOLERANCE = 5;

    /**
     * Smallest gap between neighbouring cells, as a multiple of the block's font size.
     * Ordinary word spacing is well below one font size, so this keeps prose out of tables.
     */
    private static readonly MIN_COLUMN_GAP_EM = 1;

    private pdfDocument: PDFDocumentProxy | null = null;
    private readonly options: PDFExtractionOptions;

    /**
     * @param file    The PDF file to extract.
     * @param options Heuristic toggles; unset fields default to header, list and
     *                table detection enabled with a minimum of 2 table rows.
     */
    constructor(file: File, options: PDFExtractionOptions = {}) {
        super(file);
        this.options = {
            detectHeaders: true,
            detectLists: true,
            detectTables: true,
            minTableRows: 2,
            ...options
        };
    }

    /**
     * Accepts the file when either its MIME type or its extension says PDF.
     *
     * @throws Error when neither the MIME type nor the file name indicates a PDF.
     */
    protected async validateFileType(): Promise<void> {
        if (this.file.type !== 'application/pdf' && !this.file.name.toLowerCase().endsWith('.pdf')) {
            throw new Error('Invalid file type. Expected PDF.');
        }
    }

    /**
     * Reads the file into memory and opens it with PDF.js.
     *
     * @throws Error wrapping the underlying message when the document cannot be opened.
     */
    protected async preProcess(): Promise<void> {
        try {
            const arrayBuffer = await this.file.arrayBuffer();
            this.pdfDocument = await pdfjsLib.getDocument({
                data: arrayBuffer,
                useWorkerFetch: true,
                // SECURITY: the PDF is untrusted input. With eval support enabled,
                // affected PDF.js versions can be tricked by a crafted font into
                // running arbitrary JavaScript (CVE-2024-4367). Text extraction does
                // not need it, so keep it off.
                isEvalSupported: false,
                useSystemFonts: true
            }).promise;
        } catch (error) {
            throw new Error(`Failed to initialize PDF: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
    }

    /**
     * Converts every page of the opened document to HTML, in page order.
     *
     * @returns The post-processed HTML for the whole document.
     * @throws Error when the document has not been opened, or wrapping any failure during extraction.
     */
    protected async extractContent(): Promise<string> {
        const pdfDocument = this.pdfDocument;
        if (!pdfDocument) {
            throw new Error('PDF document not initialized');
        }

        try {
            let html = '';

            // Pages are processed one at a time so that output order matches page order.
            for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
                const page = await pdfDocument.getPage(pageNum);
                try {
                    // The PDF.js item type is broader than what this module consumes,
                    // so the result is viewed through the local PDFTextContent shape.
                    const textContent = await (page as unknown as { getTextContent(): Promise<PDFTextContent> }).getTextContent();
                    html += await this.processPage(textContent.items);
                } finally {
                    // Release page resources even when extraction of this page fails.
                    await page.cleanup();
                }
            }

            return this.postProcessHTML(html);
        } catch (error) {
            throw new Error(`PDF extraction failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
    }

    /**
     * Converts the text items of one page to HTML.
     *
     * Consecutive table-like blocks are buffered and emitted together as one
     * table; any other block first flushes the buffered rows.
     *
     * @param items Text items of a single page.
     * @returns HTML for the page.
     */
    private async processPage(items: PDFTextItem[]): Promise<string> {
        const blocks = this.groupItemsIntoBlocks(items);
        let html = '';
        let tableRows: string[][] = [];

        const flushTable = (): void => {
            if (tableRows.length > 0) {
                html += this.createTableHTML(tableRows);
                tableRows = [];
            }
        };

        for (const block of blocks) {
            if (!block.text.trim()) continue;

            // Honour the detectTables option; when it is off every block is plain text.
            if (this.options.detectTables && this.isTableRow(block)) {
                tableRows.push(this.processTableRow(block));
            } else {
                flushTable();
                html += this.processTextBlock(block);
            }
        }

        // A table may run to the end of the page.
        flushTable();

        return html;
    }

    /**
     * Groups text items into lines, ordered from the top of the page downwards.
     *
     * An item joins the current block when it sits on exactly the same baseline
     * or its vertical distance to the block is smaller than the item's height.
     * The input array is not modified.
     *
     * @param items Text items of a single page.
     * @returns The blocks in top-to-bottom order.
     */
    private groupItemsIntoBlocks(items: PDFTextItem[]): TextBlock[] {
        // Sort a copy: the caller's array belongs to the PDF.js result.
        const sorted = [...items].sort((a, b) => b.transform[5] - a.transform[5]);

        const blocks: TextBlock[] = [];
        let currentBlock: TextBlock | null = null;

        for (const item of sorted) {
            const y = item.transform[5];

            if (currentBlock) {
                const yDiff = Math.abs(currentBlock.y - y);
                // `yDiff === 0` keeps zero-height items (for example whitespace-only
                // fragments) on their own line instead of splitting the line in two.
                if (yDiff === 0 || yDiff < item.height) {
                    currentBlock.items.push(item);
                    currentBlock.text += ' ' + item.str;
                    // A block opened by a zero-height item takes its size from the
                    // first item that has one.
                    if (currentBlock.fontSize === 0) {
                        currentBlock.fontSize = item.height;
                    }
                    continue;
                }
                blocks.push(currentBlock);
            }

            currentBlock = {
                items: [item],
                text: item.str,
                fontSize: item.height,
                y
            };
        }

        if (currentBlock) {
            blocks.push(currentBlock);
        }

        return blocks;
    }

    /**
     * Emits a non-table block as a heading, a list item or a paragraph.
     * Heading detection takes precedence over list detection.
     *
     * @param block The block to render.
     * @returns HTML for the block, terminated by a newline.
     */
    private processTextBlock(block: TextBlock): string {
        const text = block.text.trim();

        if (this.options.detectHeaders && this.isHeader(block)) {
            const level = this.getHeaderLevel(block);
            return `<h${level}>${this.escapeHtml(text)}</h${level}>\n`;
        }

        if (this.options.detectLists && this.isListItem(text)) {
            return this.createListItem(text);
        }

        return `<p>${this.escapeHtml(text)}</p>\n`;
    }

    /**
     * Heuristic heading test: larger than 12 units, shorter than 200
     * characters, not ending in a full stop and not starting with "<digits>.".
     */
    private isHeader(block: TextBlock): boolean {
        const text = block.text.trim();

        return (
            block.fontSize > 12 &&
            text.length < 200 &&
            !text.endsWith('.') &&
            !/^\d+\./.test(text)
        );
    }

    /**
     * Maps a block's font size to a heading level: above 20 is 1, above 16 is 2, otherwise 3.
     */
    private getHeaderLevel(block: TextBlock): number {
        const fontSize = block.fontSize;
        if (fontSize > 20) return 1;
        if (fontSize > 16) return 2;
        return 3;
    }

    /**
     * Returns true when the text starts with a bullet character, "<digits>." or
     * "<lowercase letter>)", followed by whitespace.
     */
    private isListItem(text: string): boolean {
        return (
            /^[•·⚫○◦-]\s+/.test(text) ||
            /^\d+\.\s+/.test(text) ||
            /^[a-z]\)\s+/.test(text)
        );
    }

    /**
     * Renders a list item inside its own list element; adjacent lists of the
     * same kind are merged later by postProcessHTML.
     *
     * @param text Trimmed block text that starts with a list marker.
     * @returns An `<ol>` for "<digits>." markers, otherwise a `<ul>`, holding one `<li>`.
     */
    private createListItem(text: string): string {
        const tag = /^\d+\./.test(text) ? 'ol' : 'ul';
        // The whole alternation is anchored so only a leading marker is ever removed.
        const cleanText = this.escapeHtml(
            text.replace(/^(?:[•·⚫○◦-]\s+|\d+\.\s+|[a-z]\)\s+)/, '')
        );

        return `<${tag}>\n  <li>${cleanText}</li>\n</${tag}>\n`;
    }

    /**
     * Returns the block's items that can act as table cells: whitespace-only
     * items are dropped and the rest are ordered left to right.
     * The block's own item array is left untouched.
     */
    private getCellItems(block: TextBlock): PDFTextItem[] {
        return block.items
            .filter(item => item.str.trim() !== '')
            .sort((a, b) => a.transform[4] - b.transform[4]);
    }

    /**
     * Heuristic table-row test.
     *
     * A block is a table row when it has at least two non-blank items, every
     * gap between neighbouring items is wider than MIN_COLUMN_GAP_EM font sizes,
     * and all gaps lie within GAP_TOLERANCE of their mean. The minimum gap is
     * required because a single gap always equals its own mean and evenly
     * word-spaced prose would otherwise qualify as well.
     */
    private isTableRow(block: TextBlock): boolean {
        const cells = this.getCellItems(block);
        if (cells.length < 2) return false;

        const gaps = cells.slice(1).map((cell, index) => {
            const previous = cells[index];
            return cell.transform[4] - (previous.transform[4] + previous.width);
        });

        const minGap = block.fontSize * PDFExtractor.MIN_COLUMN_GAP_EM;
        if (gaps.some(gap => gap <= minGap)) return false;

        const avgGap = gaps.reduce((sum, gap) => sum + gap, 0) / gaps.length;
        return gaps.every(gap => Math.abs(gap - avgGap) < PDFExtractor.GAP_TOLERANCE);
    }

    /**
     * Returns the trimmed cell texts of a table row, left to right.
     */
    private processTableRow(block: TextBlock): string[] {
        return this.getCellItems(block).map(item => item.str.trim());
    }

    /**
     * Renders buffered rows as a table whose first row is the header.
     *
     * Runs shorter than `minTableRows` (default 2) are emitted as one paragraph
     * per row instead. Short rows are padded with empty cells up to the widest row.
     *
     * @param rows Cell texts, one array per row.
     * @returns HTML for the table or the fallback paragraphs.
     */
    private createTableHTML(rows: string[][]): string {
        // `??` rather than `||` so an explicit 0 is respected.
        const minRows = this.options.minTableRows ?? 2;
        if (rows.length < minRows) {
            return rows
                .map(row => `<p>${this.escapeHtml(row.join(' '))}</p>\n`)
                .join('');
        }

        const maxCols = Math.max(...rows.map(row => row.length));
        let html = '<table>\n';

        html += '  <thead>\n    <tr>\n';
        for (let i = 0; i < maxCols; i++) {
            html += `      <th>${this.escapeHtml((rows[0][i] ?? '').trim())}</th>\n`;
        }
        html += '    </tr>\n  </thead>\n';

        if (rows.length > 1) {
            html += '  <tbody>\n';
            for (let i = 1; i < rows.length; i++) {
                html += '    <tr>\n';
                for (let j = 0; j < maxCols; j++) {
                    html += `      <td>${this.escapeHtml((rows[i][j] ?? '').trim())}</td>\n`;
                }
                html += '    </tr>\n';
            }
            html += '  </tbody>\n';
        }

        html += '</table>\n';
        return html;
    }

    /**
     * Tidies the generated HTML: drops empty paragraphs and headings, merges
     * adjacent lists of the same kind, removes empty tables and collapses
     * whitespace runs to a single space.
     *
     * Empty `<td>`/`<th>` cells are kept on purpose: createTableHTML emits them
     * as padding, and removing them would shift the remaining cells.
     */
    private postProcessHTML(html: string): string {
        return html
            .replace(/\n{3,}/g, '\n\n')
            .replace(/<(p|h[1-3])>\s*<\/\1>/g, '')
            // Each list item is emitted in its own list; join neighbours so that
            // ordered lists number consecutively.
            .replace(/<\/(ul|ol)>\s*<\1>/g, '')
            .replace(/<table>[\s\n]*<\/table>/g, '')
            .replace(/\s{2,}/g, ' ')
            .trim();
    }

    /**
     * Escapes the five HTML-significant characters so extracted text cannot inject markup.
     */
    private escapeHtml(unsafe: string): string {
        return unsafe
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#039;");
    }

    /**
     * Destroys the PDF.js document, if one is open.
     * The reference is cleared first so a failing destroy cannot leave a stale document behind.
     */
    protected async cleanup(): Promise<void> {
        const pdfDocument = this.pdfDocument;
        if (!pdfDocument) return;

        this.pdfDocument = null;
        await pdfDocument.destroy();
    }

    /**
     * Collects page count, title and author from the opened document.
     *
     * Metadata is best effort: a failure to read it is logged as a warning and
     * the page count alone is returned.
     *
     * @returns Page count (0 when no document is open) plus title and author when available.
     */
    protected async extractMetadata(): Promise<Partial<ExtractionResult['metadata']>> {
        const pdfDocument = this.pdfDocument;
        const metadata: Partial<ExtractionResult['metadata']> = {
            pageCount: pdfDocument ? pdfDocument.numPages : 0
        };

        try {
            if (pdfDocument) {
                const info = await pdfDocument.getMetadata() as PDFMetadata;
                if (info?.info) {
                    metadata.title = info.info.Title || '';
                    metadata.author = info.info.Author || '';
                }
            }
        } catch (error) {
            console.warn('Error extracting PDF metadata:', error);
        }

        return metadata;
    }
}