#!/usr/bin/env python3
"""
Extract text data from DOJ document collection files.
Parses .dat and .opt files to extract document metadata.
"""

import csv
import sys
from pathlib import Path


# Column-header values that appear in the .dat file and carry no document data.
_DAT_HEADER_FIELDS = frozenset({'Prod Beg', 'Prod End', 'Filename', 'FILE_PATH'})


def _decode_dat_field(raw):
    """
    Decode one raw .dat field to text without discarding any bytes.

    UTF-8 is tried first. A field that is not valid UTF-8 is decoded as
    Latin-1 instead, which maps every byte to a character, so non-ASCII
    characters in IDs or filenames are kept rather than silently dropped.
    """
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('latin-1')


def parse_dat_file(dat_path):
    """
    Parse the .dat file which contains document IDs and PDF filenames.

    The file is read as bytes and split on the 0xFE byte. Each resulting
    piece is decoded, stripped, and classified:

    * empty pieces and known column headers are skipped;
    * a value starting with 'DOJ-OGR-' fills 'doc_id_start' first, then
      'doc_id_end'; further IDs are ignored until the record is saved;
    * a value containing '.pdf' (case-insensitive) becomes 'filename' and,
      if both IDs are already known, completes the record.

    A progress banner and a sample of up to ten documents are printed.

    Args:
        dat_path: Path of the .dat file to read.

    Returns:
        A list of dicts with the keys 'doc_id_start', 'doc_id_end' and
        'filename', in the order the records were completed.
    """
    print(f"\n{'='*70}")
    print(f"Parsing DAT file: {dat_path}")
    print(f"{'='*70}\n")

    with open(dat_path, 'rb') as f:
        content = f.read()

    documents = []
    current_doc = {}

    # Every piece between two 0xFE bytes is either a field value or the
    # separator bytes that sit between fields; separators match none of the
    # branches below and simply fall through.
    for part in content.split(b'\xfe'):
        decoded = _decode_dat_field(part).strip()
        if not decoded or decoded in _DAT_HEADER_FIELDS:
            continue

        # Document ID: the first one seen is the start, the second the end.
        if decoded.startswith('DOJ-OGR-'):
            if 'doc_id_start' not in current_doc:
                current_doc['doc_id_start'] = decoded
            elif 'doc_id_end' not in current_doc:
                current_doc['doc_id_end'] = decoded
        # PDF filename: closes the record once both IDs are present.
        elif '.pdf' in decoded.lower():
            current_doc['filename'] = decoded
            # NOTE(review): when the IDs are incomplete here, the pending
            # state is kept, and a record that never yields a '.pdf' value
            # keeps its IDs until the next PDF is seen. Confirm against the
            # real file layout whether that can mis-pair records.
            if 'doc_id_start' in current_doc and 'doc_id_end' in current_doc:
                documents.append(current_doc.copy())
                current_doc = {}

    # Print summary
    print(f"Total documents found: {len(documents)}\n")
    print("Sample documents:")
    print("-" * 70)
    for i, doc in enumerate(documents[:10]):
        print(f"{i+1}. {doc.get('doc_id_start', 'N/A')} - {doc.get('doc_id_end', 'N/A')}")
        print(f"   File: {doc.get('filename', 'N/A')}")
        print()

    if len(documents) > 10:
        print(f"... and {len(documents) - 10} more documents\n")

    return documents


def parse_opt_file(opt_path):
    """
    Parse the .opt CSV file which contains image references.

    Each line is split on commas. Lines with fewer than three fields are
    skipped; for the rest, the first three fields become 'doc_id', 'volume'
    and 'image_path', and the fourth (when present) becomes 'flag'.

    A progress banner and a sample of up to ten references are printed.

    Args:
        opt_path: Path of the .opt file to read.

    Returns:
        A list of dicts with the keys 'doc_id', 'volume', 'image_path' and
        'flag' ('flag' is '' when the line has only three fields).
    """
    print(f"\n{'='*70}")
    print(f"Parsing OPT file: {opt_path}")
    print(f"{'='*70}\n")

    images = []

    # 'utf-8-sig' drops a leading byte-order mark if the file has one, so it
    # does not end up glued to the first doc_id; files without a BOM are read
    # exactly as with plain 'utf-8'.
    with open(opt_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(',')
            if len(parts) >= 3:
                images.append({
                    'doc_id': parts[0],
                    'volume': parts[1],
                    'image_path': parts[2],
                    'flag': parts[3] if len(parts) > 3 else '',
                })

    print(f"Total image references: {len(images)}\n")
    print("Sample image references:")
    print("-" * 70)
    for i, img in enumerate(images[:10]):
        print(f"{i+1}. {img['doc_id']}: {img['image_path']}")

    if len(images) > 10:
        print(f"... and {len(images) - 10} more images\n")

    return images


def _write_dict_csv(csv_path, rows):
    """
    Write a list of dicts to csv_path, using the first row's keys as header.

    The file is always created; it is left empty when rows is empty.
    """
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        if rows:
            writer = csv.DictWriter(f, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)


def export_to_text(documents, images, output_dir):
    """
    Export extracted data to text files.

    Four files are written into output_dir: 'documents_list.txt',
    'images_list.txt', 'documents.csv' and 'images.csv'. A confirmation
    line is printed after each file.

    Args:
        documents: List of dicts; the keys 'doc_id_start', 'doc_id_end' and
            'filename' are read for the text listing.
        images: List of dicts; the keys 'doc_id' and 'image_path' are read
            for the text listing.
        output_dir: Directory to write into. It is created if missing,
            including any missing parent directories.
    """
    output_path = Path(output_dir)
    # parents=True so a nested output directory works even when its parent
    # does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)

    # Export documents list
    doc_file = output_path / "documents_list.txt"
    with open(doc_file, 'w', encoding='utf-8') as f:
        f.write("DOJ Document Collection - Extracted Documents\n")
        f.write("=" * 70 + "\n\n")
        for i, doc in enumerate(documents, 1):
            f.write(f"{i}. Document ID Range: {doc.get('doc_id_start')} - {doc.get('doc_id_end')}\n")
            f.write(f"   Filename: {doc.get('filename')}\n\n")

    print(f"✓ Exported documents list to: {doc_file}")

    # Export images list
    img_file = output_path / "images_list.txt"
    with open(img_file, 'w', encoding='utf-8') as f:
        f.write("DOJ Document Collection - Image References\n")
        f.write("=" * 70 + "\n\n")
        for i, img in enumerate(images, 1):
            f.write(f"{i}. {img['doc_id']}: {img['image_path']}\n")

    print(f"✓ Exported images list to: {img_file}")

    # Export CSV versions
    doc_csv = output_path / "documents.csv"
    _write_dict_csv(doc_csv, documents)
    print(f"✓ Exported documents CSV to: {doc_csv}")

    img_csv = output_path / "images.csv"
    _write_dict_csv(img_csv, images)
    print(f"✓ Exported images CSV to: {img_csv}")


def main(data_dir="data", output_dir="extracted"):
    """
    Parse every .dat and .opt file in data_dir and export the results.

    Exits with status 1 when the directory contains neither kind of file.
    When at least one document or image reference was extracted, the data
    is exported to output_dir and a summary is printed.

    Args:
        data_dir: Directory searched (non-recursively) for '*.dat' and
            '*.opt' files. Defaults to 'data'.
        output_dir: Directory the exported files are written to. Defaults
            to 'extracted'.
    """
    data_dir = Path(data_dir)

    # Find .dat and .opt files. Sorted so that the order of the merged
    # results (and the numbering in the exported lists) does not depend on
    # the order in which the filesystem happens to list the directory.
    dat_files = sorted(data_dir.glob("*.dat"))
    opt_files = sorted(data_dir.glob("*.opt"))

    if not dat_files and not opt_files:
        print(f"Error: No .dat or .opt files found in {data_dir} directory")
        sys.exit(1)

    all_documents = []
    all_images = []

    # Parse all .dat files
    for dat_file in dat_files:
        all_documents.extend(parse_dat_file(dat_file))

    # Parse all .opt files
    for opt_file in opt_files:
        all_images.extend(parse_opt_file(opt_file))

    # Export extracted data
    if all_documents or all_images:
        print(f"\n{'='*70}")
        print("Exporting extracted data...")
        print(f"{'='*70}\n")
        export_to_text(all_documents, all_images, output_dir)

        print(f"\n{'='*70}")
        print("Extraction Complete!")
        print(f"{'='*70}")
        print(f"Total documents extracted: {len(all_documents)}")
        print(f"Total images extracted: {len(all_images)}")
        print(f"\nExtracted files saved to '{output_dir}/' directory")

# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()