#!/usr/bin/env python3
"""
excel_parser.py — Mechanical Excel extraction for the excel_to_counters skill.
Usage: python3 excel_parser.py /path/to/file.xlsx [--sheet SheetName]
Output: JSON written to stdout.

Handles:
- Header row at row 1, 3, 4 (skip logo/summary rows)
- Merged cells (uses top-left value of each merge region)
- Newline-delimited cells ("Name\nAddress" Rajasthan pattern)
- Column role auto-detection (fuzzy keyword match)
- Filename metadata hints (brand, state)
"""

import sys, json, re, os
import openpyxl
from openpyxl.utils import get_column_letter

# ── Column role keyword map ──────────────────────────────────────────────────
ROLE_KEYWORDS = {
    'name':           ['store name', 'outlet name', 'name', 'dealer', 'shop name', 'customer name'],
    'address':        ['store address', 'address', 'location', 'addr', 'store address'],
    'city':           ['city', 'town'],
    'state':          ['state'],
    'pincode':        ['pin code', 'pincode', 'pin', 'zip', 'postal'],
    'phone':          ['phone', 'mobile', 'contact no', 'contact number', 'ph no', 'ph#', 'phone #'],
    'gstin':          ['gstin', 'gst no', 'gstn', 'gst'],
    'contact_person': ['contact person', 'partner name', 'spoc', 'proprietor', 'owner'],
    'serial':         ['sr', 's.no', 'sr no', 'sr.', 'sr. no', 'serial', 's. no'],
    'territory':      ['territory', 'zone', 'region'],
    'counter_type':   ['type', 'counter type', 'channel'],
    'remarks':        ['remarks', 'notes', 'comment'],
    'branding_type':  ['branding type', 'type of branding'],
}

# ── Brand hints from filename ────────────────────────────────────────────────
BRAND_PATTERNS = ['HP', 'SAMSUNG', 'LENOVO', 'LIEBHERR', 'DELL', 'APPLE', 'LG', 'SONY']
STATE_PATTERNS = {
    'punjab': 'Punjab',
    'rajasthan': 'Rajasthan',
    'haryana': 'Haryana',
    'himachal': 'Himachal Pradesh',
    'uttarakhand': 'Uttarakhand',
    'gujarat': 'Gujarat',
    'maharashtra': 'Maharashtra',
    'delhi': 'Delhi',
    'up': 'Uttar Pradesh',
    'mp': 'Madhya Pradesh',
    'karnataka': 'Karnataka',
}


def unmerge_values(ws):
    """Return dict {(row,col): value} expanding merged cells to their top-left value."""
    merged_map = {}
    for merged_range in ws.merged_cells.ranges:
        top_left = ws.cell(merged_range.min_row, merged_range.min_col).value
        for row in range(merged_range.min_row, merged_range.max_row + 1):
            for col in range(merged_range.min_col, merged_range.max_col + 1):
                merged_map[(row, col)] = top_left
    return merged_map


def get_cell_value(ws, row, col, merged_map):
    key = (row, col)
    if key in merged_map:
        return merged_map[key]
    return ws.cell(row, col).value


def detect_header_row(ws, merged_map, max_scan=12):
    """
    Find the best header row by scanning up to max_scan rows.
    Criteria:
    - 3+ non-empty cells, 3+ string cells (keyword-like)
    - At least one text cell must appear within the FIRST 5 columns
      (guards against mid-sheet label rows like 'Supporting (Nos.)' at col 16)
    - NOT a pure-numeric row
    Picks the row with the MOST matching text cells (highest score).
    Stops early if score >= 6 (unambiguous header found).
    Returns 1-indexed row number.
    """
    best_row = 1
    best_score = 0
    for r in range(1, min(max_scan + 1, ws.max_row + 1)):
        row_vals = [get_cell_value(ws, r, c, merged_map) for c in range(1, ws.max_column + 1)]
        non_empty = [v for v in row_vals if v not in (None, '', 0)]
        text_with_idx = [(i, v) for i, v in enumerate(row_vals)
                         if isinstance(v, str) and len(str(v).strip()) > 1]
        numeric_only = all(isinstance(v, (int, float)) for v in non_empty)
        if numeric_only or len(non_empty) < 3 or len(text_with_idx) < 2:
            continue
        # Must have at least one text cell in first 5 columns
        early_text = [i for i, _ in text_with_idx if i < 5]
        if not early_text:
            continue
        score = len(text_with_idx)
        if score > best_score:
            best_score = score
            best_row = r
        if score >= 6:
            break
    return best_row


def detect_column_roles(header_values):
    """
    Map column index (0-based) to role string.
    Returns dict {role: col_index}.
    Prioritises longer keyword matches over shorter ones.
    """
    mapping = {}
    header_lower = [(i, str(h).lower().strip() if h else '') for i, h in enumerate(header_values)]
    for role, keywords in ROLE_KEYWORDS.items():
        # Sort keywords longest-first so "store name" beats "name"
        for kw in sorted(keywords, key=len, reverse=True):
            for i, h in header_lower:
                if kw in h:
                    if role not in mapping:
                        mapping[role] = i
                    break
            if role in mapping:
                break
    return mapping


def extract_filename_hints(filepath):
    """Guess brand and state from filename."""
    fname = os.path.basename(filepath).upper()
    hints = {}
    for brand in BRAND_PATTERNS:
        if brand in fname:
            # HP is ambiguous (Himachal Pradesh vs brand)
            if brand == 'HP' and ('HIMACHAL' in fname or 'H.P' in fname):
                hints['state'] = 'Himachal Pradesh'
            else:
                hints['brand'] = brand
            break
    fname_lower = os.path.basename(filepath).lower()
    for key, state in STATE_PATTERNS.items():
        if key in fname_lower:
            if 'state' not in hints:
                hints['state'] = state
            break
    return hints


def clean_phone(val):
    """Normalise phone: first 10-digit number in the string."""
    if val is None:
        return ''
    s = re.sub(r'[^0-9/,;]', ' ', str(val))
    # Find first 10-digit sequence
    match = re.search(r'(?<!\d)(\d{10})(?!\d)', s.replace(' ', ''))
    if match:
        return match.group(1)
    # Fall back to raw cleaned string (may be multiple numbers)
    return str(val).strip()


def extract_pincode_from_text(text):
    """Find a 6-digit number that looks like an Indian PIN."""
    if not text:
        return ''
    match = re.search(r'\b([1-9][0-9]{5})\b', str(text))
    return match.group(1) if match else ''


def parse_sheet(ws, merged_map, filepath):
    header_row = detect_header_row(ws, merged_map)
    header_vals = [get_cell_value(ws, header_row, c, merged_map) for c in range(1, ws.max_column + 1)]
    col_roles = detect_column_roles(header_vals)

    rows = []
    prev_serial = None
    for r in range(header_row + 1, ws.max_row + 1):
        row_vals = [get_cell_value(ws, r, c, merged_map) for c in range(1, ws.max_column + 1)]

        # Skip completely empty rows
        if all(v in (None, '', 0) for v in row_vals):
            continue

        def get_role(role):
            idx = col_roles.get(role)
            if idx is None:
                return ''
            v = row_vals[idx] if idx < len(row_vals) else None
            return str(v).strip() if v not in (None, '') else ''

        serial_raw = get_role('serial')
        # Skip summary/totals rows (serial is blank but row has numeric values in later cols)
        if not serial_raw and prev_serial:
            non_empty = [v for v in row_vals if v not in (None, '')]
            text_cells = [v for v in non_empty if isinstance(v, str) and len(str(v).strip()) > 2]
            if len(text_cells) < 2:
                continue

        if serial_raw:
            prev_serial = serial_raw

        raw_name_addr = get_role('name')  # may be "Name\nAddress" merged cell
        raw_address   = get_role('address')
        raw_city      = get_role('city')
        raw_state     = get_role('state')
        raw_pincode   = get_role('pincode')
        raw_phone     = get_role('phone')
        raw_gstin     = get_role('gstin')
        raw_contact   = get_role('contact_person')

        # ── Rajasthan pattern: name+address in one cell with \n ──────────────
        # E.g. "Sunil Vaishnav\nC-135 pandit dharam narayan compound..."
        name_part = raw_name_addr
        addr_part = raw_address
        if '\n' in raw_name_addr and not raw_address:
            parts = raw_name_addr.split('\n', 1)
            name_part = parts[0].strip()
            addr_part = parts[1].strip() if len(parts) > 1 else ''

        # ── Continuation row (merged cell B6:B7 in Rajasthan) ───────────────
        # If the "name" cell is empty but address has content, it's a continuation
        is_continuation = (not name_part and addr_part and prev_serial is not None)

        # ── Extract pincode from address when missing ────────────────────────
        embedded_pincode = ''
        if not raw_pincode:
            embedded_pincode = extract_pincode_from_text(addr_part) or extract_pincode_from_text(raw_city)

        row_obj = {
            '_row': r,
            '_serial': serial_raw,
            '_is_continuation': is_continuation,
            'raw_name': name_part,
            'raw_address': addr_part,
            'raw_city': raw_city,
            'raw_state': raw_state,
            'raw_pincode': raw_pincode or embedded_pincode,
            'raw_phone': clean_phone(raw_phone),
            'raw_gstin': raw_gstin,
            'raw_contact': raw_contact,
            # Flags for AI normalization
            '_needs_city_extraction': not raw_city,
            '_needs_pincode_extraction': not raw_pincode,
            '_has_co_pattern': bool(re.search(r'\bC/O\b|\bC\.O\b|\bc/o\b', name_part, re.I)),
            '_name_addr_merged': '\n' in raw_name_addr and not raw_address,
        }
        rows.append(row_obj)

    return {
        'header_row': header_row,
        'header_values': [str(h) if h is not None else '' for h in header_vals],
        'col_roles': col_roles,
        'rows': rows,
        'row_count': len(rows),
        'needs_ai_extraction': sum(1 for r in rows if r['_needs_city_extraction'] or r['_has_co_pattern']),
    }


def main():
    args = sys.argv[1:]
    if not args:
        print(json.dumps({'error': 'Usage: excel_parser.py <file.xlsx> [--sheet SheetName]'}))
        sys.exit(1)

    filepath = args[0]
    target_sheet = None
    if '--sheet' in args:
        idx = args.index('--sheet')
        if idx + 1 < len(args):
            target_sheet = args[idx + 1]

    if not os.path.exists(filepath):
        print(json.dumps({'error': f'File not found: {filepath}'}))
        sys.exit(1)

    try:
        wb = openpyxl.load_workbook(filepath, data_only=True)
    except Exception as e:
        print(json.dumps({'error': f'Cannot open file: {e}'}))
        sys.exit(1)

    filename_hints = extract_filename_hints(filepath)
    sheets_output = {}

    sheet_names = [target_sheet] if target_sheet else wb.sheetnames
    for sname in sheet_names:
        if sname not in wb.sheetnames:
            continue
        ws = wb[sname]
        if ws.max_row < 2:
            continue
        merged_map = unmerge_values(ws)
        sheets_output[sname] = parse_sheet(ws, merged_map, filepath)

    output = {
        'filepath': filepath,
        'filename': os.path.basename(filepath),
        'filename_hints': filename_hints,
        'sheets': sheets_output,
        'sheet_names': list(sheets_output.keys()),
    }
    print(json.dumps(output, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()