import os
import json
import hashlib
import re
from pathlib import Path

def get_sha256(file_path):
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 64 KiB pieces, so a
    large file never has to be held in memory all at once.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            chunk = stream.read(65536)
            if not chunk:
                # An empty read means end of file has been reached.
                break
            digest.update(chunk)
    return digest.hexdigest()

# Matches the FILE_SCHEMA entry of a STEP header, e.g. FILE_SCHEMA(('IFC4'));
# Each quoted name is matched with [^']* so the capture can never run across a
# quote into a following name; any additional comma-separated names are
# accepted but not captured.
_FILE_SCHEMA_RE = re.compile(
    r"FILE_SCHEMA\s*\(\s*\(\s*'([^']*)'\s*(?:,\s*'[^']*'\s*)*\)\s*\)\s*;"
)


def get_ifc_schema(file_path):
    """Return the schema name declared in the file's FILE_SCHEMA header entry.

    Only the first 10240 characters of the file are inspected. When the
    entry lists several schema names, the first one is returned.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The first schema name as a string, or ``None`` when no FILE_SCHEMA
        entry is found in the inspected prefix or the file cannot be read.

    This function is best-effort and does not raise: any error while reading
    is printed and ``None`` is returned.
    """
    try:
        # Undecodable bytes are dropped (errors="ignore") so that a stray
        # non-UTF-8 byte in the header does not prevent the lookup.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            header = f.read(10240)
    except Exception as e:
        print(f"Error reading schema from {file_path}: {e}")
        return None

    match = _FILE_SCHEMA_RE.search(header)
    if match:
        return match.group(1)
    return None

def build_index(root='.', output_path='index.json'):
    """Index every ``*.ifc`` file under ``root`` and write the result as JSON.

    For each path matched recursively under ``root`` (visited in sorted
    order) an entry with the keys ``path``, ``size_bytes``, ``sha256`` and
    ``schema`` is recorded. A path that cannot be indexed is reported on
    stdout and left out of the index; it does not abort the run.

    Args:
        root: Directory to search recursively. Defaults to the current
            working directory, as before.
        output_path: File the JSON index is written to. Defaults to
            ``index.json`` in the current working directory, as before.
    """
    # Find all .ifc files recursively
    ifc_paths = sorted(Path(root).rglob('*.ifc'))

    index_data = []
    for p in ifc_paths:
        file_path = str(p)
        print(f"Indexing {file_path}...")

        try:
            entry = {
                "path": file_path,
                "size_bytes": p.stat().st_size,
                "sha256": get_sha256(file_path),
                "schema": get_ifc_schema(file_path),
            }
        except Exception as e:
            # Per-file boundary: report the failure and keep indexing the
            # remaining files.
            print(f"Failed to index {file_path}: {e}")
            continue
        index_data.append(entry)

    # Write the index; an explicit encoding keeps the output independent of
    # the platform's default locale encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(index_data, f, indent=4)
    print(f"Created {output_path} with {len(index_data)} files.")

# Build the index only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    build_index()
