


import os
import pandas as pd
import numpy as np
import requests
import json
from datetime import datetime, timedelta
import time
from typing import Dict, List, Tuple, Optional, Union, Any
import logging
import re
import csv
import pickle
from dotenv import load_dotenv
import ast
from collections import Counter
from tqdm import tqdm
import h3


# --- Scikit-learn Imports ---
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# --- End Scikit-learn Imports ---

# --- Global Settings & Constants ---
# Alias np.float_ to np.float64. NumPy 2.0 removed np.float_, so this
# presumably keeps code that still references the old name working.
np.float_ = np.float64

# OpenRouter chat-completions endpoint and model identifier
# (presumably consumed by LLM-calling code elsewhere in this file).
OPENROUTER_MODEL_NAME = "deepseek/deepseek-r1-0528-qwen3-8b:free"
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# Set up logging: INFO and above goes to both a log file and the console.
# NOTE: mode='w' truncates pricing_agent_module.log every time this module
# is imported, so only the most recent run is kept on disk.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("pricing_agent_module.log", mode='w'),
        logging.StreamHandler()
    ]
)
# Module-wide logger used by every class and helper below.
logger = logging.getLogger("pricing_agent_module")

# Default `min_frequency` for market feature-importance calculation.
MIN_AMENITY_FREQUENCY = 5

# --- Configuration ---
# NOTE(review): BATCH_SIZE and the LLM/prompt file names are not used in the
# visible part of the file; presumably consumed further down - confirm.
BATCH_SIZE = 10
# Default pickle file for ApiManager's on-disk response cache.
API_CACHE_FILE = "api_cache.pkl"
LLM_CACHE_FILE = "llm_cache.pkl"
PROMPT_TEMPLATE_FILE = "prompt_templates.json"
PROMPT_HISTORY_FILE = "prompt_history.csv"

# --- Load Environment Variables ---
# Reads a .env file (if one is found) into the process environment.
load_dotenv()

# --- Helper Function for JSON Serialization ---
def convert_numpy_types(obj):
    """Recursively convert numpy/pandas values into JSON-serializable types.

    Conversion rules:
      * ``None``, ``str``, ``int`` and ``bool`` pass through unchanged.
      * Python and numpy floats become ``float``; NaN and +/-inf become ``None``.
      * numpy integers become ``int``; ``np.bool_`` becomes ``bool``.
      * ``np.void`` and ``pd.NaT`` become ``None``.
      * ``pd.Timestamp`` and datetime64 scalars become ISO-8601 strings.
      * dicts are converted value by value (keys are left untouched).
      * lists, tuples and ``np.ndarray`` become lists of converted items.
      * Exceptions become ``{"error_type": ..., "message": ...}``.
      * numpy complex scalars become ``{"real": float, "imag": float}``.
      * Anything else is converted with ``str()`` (``None`` if that fails).

    Args:
        obj: Any value, possibly nested.

    Returns:
        A structure built only from dict, list, str, int, float, bool and None.
    """
    if obj is None:
        return None
    if isinstance(obj, (str, int, bool)):
        return obj
    if isinstance(obj, float):
        # np.float64 subclasses float, so it is handled (and normalised) here.
        return None if np.isnan(obj) or np.isinf(obj) else float(obj)
    # Use the abstract numpy scalar bases rather than enumerating concrete
    # types: this covers every width and avoids np.float_, which NumPy 2.0
    # removed.
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return None if np.isnan(obj) or np.isinf(obj) else float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.void):
        return None
    if obj is pd.NaT:
        # NaT is not a pd.Timestamp instance; without this it would fall
        # through to str() and be emitted as the string "NaT".
        return None
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat() if pd.notna(obj) else None
    if isinstance(obj, dict):
        return {k: convert_numpy_types(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples are emitted as lists (JSON has no tuple type) so that their
        # elements get converted instead of being stringified wholesale.
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, np.ndarray):
        return convert_numpy_types(obj.tolist())
    if isinstance(obj, Exception):
        return {"error_type": type(obj).__name__, "message": str(obj)}

    # Best-effort handling of datetime64-like scalars. Any failure in the
    # probe itself (e.g. an ambiguous truth value) falls through to the
    # generic handling below.
    try:
        if pd.api.types.is_datetime64_any_dtype(obj):
            if pd.notna(obj):
                try:
                    return pd.Timestamp(obj).isoformat()
                except Exception:
                    return None
            else:
                return None
    except Exception:
        pass

    if isinstance(obj, (np.complex128, np.complex64)):
        # Cast the parts: np.float32 (from complex64) is not JSON-serializable.
        return {'real': float(obj.real), 'imag': float(obj.imag)}

    logger.debug(f"Unhandled type in convert_numpy_types: {type(obj)}. Converting to string.")
    try:
        return str(obj)
    except Exception:
        return None

# --- Core Classes ---

class ApiManager:
    """Manages API calls to external data sources with caching and rate limiting.

    Four HTTP APIs are wrapped: Census, FRED, Alpha Vantage and Ticketmaster.
    Every ``get_*`` method returns a dict with a boolean ``"success"`` key and
    either a ``"data"`` or an ``"error"`` key; none of them raise on network
    or decoding failures. Successful responses are kept in an in-memory cache
    (one bucket per API) with a per-API freshness window. The cache is
    restored from ``API_CACHE_FILE`` on construction and can be persisted
    with :meth:`save_cache`.
    """

    # One cache bucket per supported API.
    _API_NAMES = ("census", "fred", "alpha_vantage", "ticketmaster")
    # Seconds to wait for a single HTTP response. Without a timeout,
    # requests can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_keys: Optional[Dict[str, str]] = None):
        """Create a manager.

        Args:
            api_keys: Mapping of API name ("fred", "alpha_vantage",
                "ticketmaster") to its key. Missing keys make the matching
                ``get_*`` method return a failure result.
        """
        self.api_keys = api_keys or {}
        # "last_call" holds the time.time() of the most recent request.
        self.rate_limits = {
            "alpha_vantage": {"calls_per_minute": 5, "last_call": None},
            "census": {"calls_per_minute": 10, "last_call": None},
            "fred": {"calls_per_minute": 120, "last_call": None},
            "ticketmaster": {"calls_per_minute": 5, "last_call": None},
        }
        self.cache = self._empty_cache()
        self.load_cache(API_CACHE_FILE, silent=True)

    @classmethod
    def _empty_cache(cls) -> Dict[str, Dict]:
        """Return a fresh cache with one empty bucket per API."""
        return {name: {} for name in cls._API_NAMES}

    def _respect_rate_limit(self, api_name: str):
        """Sleep just long enough to honour ``api_name``'s calls-per-minute limit.

        Unknown API names are ignored. The call time is recorded afterwards.
        """
        limit_info = self.rate_limits.get(api_name)
        if limit_info is None:
            return
        # Minimum spacing between calls, plus a 0.1 s safety margin.
        wait_time = (60 / limit_info["calls_per_minute"]) + 0.1
        last_call = limit_info["last_call"]
        if last_call is not None:
            elapsed = time.time() - last_call
            if elapsed < wait_time:
                sleep_duration = wait_time - elapsed
                logger.debug(f"Rate limit: sleeping for {sleep_duration:.2f} seconds for {api_name}")
                time.sleep(sleep_duration)
        limit_info["last_call"] = time.time()

    def _get_cache_key(self, api_name: str, **params) -> str:
        """Build a deterministic cache key from the API name and call parameters."""
        param_str = json.dumps(params, sort_keys=True)
        return f"{api_name}_{param_str}"

    def _get_fresh_cached(self, api_name: str, cache_key: str, max_age_seconds: float) -> Optional[Dict]:
        """Return the cached result for ``cache_key`` if younger than ``max_age_seconds``, else None."""
        entry = self.cache[api_name].get(cache_key)
        if (isinstance(entry, dict) and 'timestamp' in entry
                and time.time() - entry['timestamp'] < max_age_seconds):
            return entry['data']
        return None

    def _store_in_cache(self, api_name: str, cache_key: str, result: Dict) -> None:
        """Record ``result`` in the cache together with the current time."""
        self.cache[api_name][cache_key] = {'data': result, 'timestamp': time.time()}

    def get_census_vacancy_data(self, state: str, county: Optional[str] = None) -> Dict:
        """Fetch vacancy / homeownership rates from the Census HVS time series.

        Results are cached for 24 hours.

        Args:
            state: State code placed in the ``for=state:...`` predicate.
            county: Optional county code placed in the ``in=county:...`` predicate.

        Returns:
            ``{"success": True, "data": [row_dict, ...]}`` or
            ``{"success": False, "error": str}``.
        """
        cache_key = self._get_cache_key("census", state=state, county=county)
        cached = self._get_fresh_cached("census", cache_key, 86400)
        if cached is not None:
            logger.info(f"Using cached Census data for state {state}, county {county}")
            return cached

        self._respect_rate_limit("census")
        base_url = "https://api.census.gov/data"
        dataset = "timeseries/eits/hvs"
        # The Census time predicate is "from <year>". requests URL-encodes the
        # space as "+", which is the form shown in Census example URLs; a
        # literal "+" here would be sent as "%2B".
        params = {"get": "VACANCY_RATE,HOMEOWNERSHIP_RATE", "for": f"state:{state}", "time": "from 2023"}
        if county:
            params["in"] = f"county:{county}"
        try:
            response = requests.get(f"{base_url}/{dataset}", params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
            # The first row is the header, so fewer than two rows means no data.
            if len(data) < 2:
                logger.warning(f"No Census data found for state {state}, county {county}")
                return {"success": False, "error": "No data returned from API"}
            headers = data[0]
            result = [dict(zip(headers, row)) for row in data[1:]]
            result_dict = {"success": True, "data": result}
            self._store_in_cache("census", cache_key, result_dict)
            return result_dict
        # JSON errors are checked first: recent requests versions raise a
        # JSONDecodeError that is also a RequestException, which would
        # otherwise hide this more specific handler.
        except json.JSONDecodeError as e:
            logger.error(f"Census API JSON error: {e} - Response text: {response.text[:500]}")
            return {"success": False, "error": f"JSON decode error: {e}"}
        except requests.exceptions.RequestException as e:
            logger.error(f"Census API request error: {e}")
            return {"success": False, "error": str(e)}
        except Exception as e:
            logger.error(f"Unexpected Census API error: {e}", exc_info=True)
            return {"success": False, "error": str(e)}

    def get_fred_housing_data(self, series_ids: List[str]) -> Dict:
        """Fetch the latest observations (up to 10, last two years) for FRED series.

        Fully successful results are cached for 12 hours. A result in which
        any series failed is returned but not cached, so a transient error is
        retried on the next call.

        Args:
            series_ids: FRED series identifiers.

        Returns:
            ``{"success": bool, "data": {series_id: observations | {"error": str}}}``,
            or ``{"success": False, "error": str}`` when no API key is configured.
        """
        cache_key = self._get_cache_key("fred", series_ids=sorted(series_ids))
        cached = self._get_fresh_cached("fred", cache_key, 43200)
        if cached is not None:
            logger.info(f"Using cached FRED data for series {series_ids}")
            return cached

        fred_api_key = self.api_keys.get("fred")
        if not fred_api_key:
            logger.error("FRED API key missing.")
            return {"success": False, "error": "FRED API key not provided"}

        base_url = "https://api.stlouisfed.org/fred/series/observations"
        all_data = {}
        success_flag = True
        for series_id in series_ids:
            self._respect_rate_limit("fred")
            params = {
                "series_id": series_id,
                "api_key": fred_api_key,
                "file_type": "json",
                "observation_start": (datetime.now() - timedelta(days=365 * 2)).strftime("%Y-%m-%d"),
                "observation_end": datetime.now().strftime("%Y-%m-%d"),
                "sort_order": "desc",
                "limit": 10,
            }
            try:
                response = requests.get(base_url, params=params, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                all_data[series_id] = response.json().get("observations", [])
                logger.debug(f"Successfully fetched FRED data for {series_id}")
            except json.JSONDecodeError as e:
                logger.error(f"FRED API JSON error for {series_id}: {e} - Response text: {response.text[:500]}")
                all_data[series_id] = {"error": f"JSON decode error: {e}"}
                success_flag = False
            except requests.exceptions.RequestException as e:
                logger.error(f"FRED API request error for {series_id}: {e}")
                all_data[series_id] = {"error": str(e)}
                success_flag = False
            except Exception as e:
                logger.error(f"Unexpected FRED API error for {series_id}: {e}", exc_info=True)
                all_data[series_id] = {"error": str(e)}
                success_flag = False

        result_dict = {"success": success_flag, "data": all_data}
        if success_flag:
            self._store_in_cache("fred", cache_key, result_dict)
        return result_dict

    def get_alpha_vantage_stock_data(self, symbol: str) -> Dict:
        """Fetch the compact TIME_SERIES_DAILY payload for ``symbol``.

        Results are cached for one hour, except payloads that carry a
        top-level "Note" / "Information" message (logged as an API limit
        notice), which are returned but not cached.

        Returns:
            ``{"success": True, "data": payload}`` or
            ``{"success": False, "error": str}``.
        """
        cache_key = self._get_cache_key("alpha_vantage", symbol=symbol)
        cached = self._get_fresh_cached("alpha_vantage", cache_key, 3600)
        if cached is not None:
            logger.info(f"Using cached Alpha Vantage data for {symbol}")
            return cached

        # Check the key before rate limiting so a misconfiguration fails fast
        # instead of sleeping first.
        alpha_vantage_key = self.api_keys.get("alpha_vantage")
        if not alpha_vantage_key:
            logger.error("Alpha Vantage API key missing.")
            return {"success": False, "error": "Alpha Vantage API key not provided"}

        self._respect_rate_limit("alpha_vantage")
        base_url = "https://www.alphavantage.co/query"
        params = {"function": "TIME_SERIES_DAILY", "symbol": symbol, "outputsize": "compact",
                  "apikey": alpha_vantage_key}
        try:
            response = requests.get(base_url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
            has_limit_notice = "Note" in data or "Information" in data
            if has_limit_notice:
                logger.warning(
                    f"Alpha Vantage API limit note/info: {data.get('Note', data.get('Information'))}")
            if "Error Message" in data:
                logger.error(f"Alpha Vantage API Error for {symbol}: {data['Error Message']}")
                return {"success": False, "error": data['Error Message']}
            result_dict = {"success": True, "data": data}
            if not has_limit_notice:
                self._store_in_cache("alpha_vantage", cache_key, result_dict)
            return result_dict
        except json.JSONDecodeError as e:
            logger.error(f"Alpha Vantage API JSON error: {e} - Response text: {response.text[:500]}")
            return {"success": False, "error": f"JSON decode error: {e}"}
        except requests.exceptions.RequestException as e:
            logger.error(f"Alpha Vantage API request error: {e}")
            return {"success": False, "error": str(e)}
        except Exception as e:
            logger.error(f"Unexpected Alpha Vantage API error: {e}", exc_info=True)
            return {"success": False, "error": str(e)}

    def get_ticketmaster_events(self, lat: float, lon: float, radius: int = 25) -> Dict:
        """Fetch up to 100 events in the next 90 days around a coordinate.

        Results are cached for six hours.

        Args:
            lat: Latitude of the search centre.
            lon: Longitude of the search centre.
            radius: Search radius in miles.

        Returns:
            ``{"success": True, "data": payload}`` or
            ``{"success": False, "error": str}``.
        """
        cache_key = self._get_cache_key("ticketmaster", lat=lat, lon=lon, radius=radius)
        cached = self._get_fresh_cached("ticketmaster", cache_key, 21600)
        if cached is not None:
            logger.info(f"Using cached Ticketmaster data for location ({lat:.4f}, {lon:.4f})")
            return cached

        # Check the key before rate limiting so a misconfiguration fails fast.
        ticketmaster_key = self.api_keys.get("ticketmaster")
        if not ticketmaster_key:
            logger.error("Ticketmaster API key missing.")
            return {"success": False, "error": "Ticketmaster API key not provided"}

        self._respect_rate_limit("ticketmaster")
        base_url = "https://app.ticketmaster.com/discovery/v2/events"
        start_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
        end_date = (datetime.now() + timedelta(days=90)).strftime("%Y-%m-%dT%H:%M:%SZ")
        params = {"apikey": ticketmaster_key, "latlong": f"{lat},{lon}", "radius": radius, "unit": "miles",
                  "size": 100, "startDateTime": start_date, "endDateTime": end_date, "sort": "date,asc"}
        try:
            response = requests.get(base_url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
            result_dict = {"success": True, "data": data}
            self._store_in_cache("ticketmaster", cache_key, result_dict)
            return result_dict
        except json.JSONDecodeError as e:
            logger.error(f"Ticketmaster API JSON error: {e} - Response text: {response.text[:500]}")
            return {"success": False, "error": f"JSON decode error: {e}"}
        except requests.exceptions.RequestException as e:
            logger.error(f"Ticketmaster API request error: {e}")
            return {"success": False, "error": str(e)}
        except Exception as e:
            logger.error(f"Unexpected Ticketmaster API error: {e}", exc_info=True)
            return {"success": False, "error": str(e)}

    def save_cache(self, cache_path: str = API_CACHE_FILE) -> bool:
        """Pickle the in-memory cache to ``cache_path``.

        Returns:
            True on success, False if writing failed (the error is logged).
        """
        try:
            with open(cache_path, 'wb') as f:
                pickle.dump(self.cache, f, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(f"API cache saved to {cache_path}")
            return True
        except Exception as e:
            logger.error(f"Error saving API cache to {cache_path}: {e}", exc_info=True)
            return False

    def load_cache(self, cache_path: str = API_CACHE_FILE, silent: bool = False) -> bool:
        """Replace the in-memory cache with the one pickled at ``cache_path``.

        The file is accepted only if it holds a dict containing a dict bucket
        for every supported API; otherwise the cache is reset to empty.

        SECURITY NOTE: ``pickle.load`` can execute arbitrary code embedded in
        the file. Only load cache files written by this application.

        Args:
            cache_path: Path of the pickle file.
            silent: Suppress the informational "loaded" / "not found" logs.
                Warnings and errors are always logged.

        Returns:
            True if a valid cache was loaded, False otherwise.
        """
        try:
            if not os.path.exists(cache_path):
                if not silent:
                    logger.info("No API cache file found. Starting with empty cache.")
                return False

            with open(cache_path, 'rb') as f:
                loaded_cache = pickle.load(f)

            is_valid = isinstance(loaded_cache, dict) and all(
                isinstance(loaded_cache.get(name), dict) for name in self._API_NAMES)
            if is_valid:
                self.cache = loaded_cache
                if not silent:
                    logger.info(f"API cache loaded from {cache_path}")
                return True

            logger.warning(f"Cache file {cache_path} has unexpected format. Starting with empty cache.")
            self.cache = self._empty_cache()
            return False
        except Exception as e:
            logger.error(f"Error loading API cache from {cache_path}: {e}. Starting with empty cache.",
                         exc_info=True)
            self.cache = self._empty_cache()
            return False

class DataProcessor:
    """Processes and combines data from various sources"""
    def __init__(self):
        self.listings_df: Optional[pd.DataFrame] = None
        self.market_df: Optional[pd.DataFrame] = None
        self.feature_cache: Dict = {}

    def load_dataframes(self, listings_df: pd.DataFrame, market_df: Optional[pd.DataFrame] = None) -> Tuple[bool, str]:
        """Loads and processes pandas DataFrames."""
        try:
            if listings_df is None or listings_df.empty:
                return False, "Listings DataFrame is empty or None."
            self.listings_df = listings_df.copy()
            logger.info(f"Loaded {len(self.listings_df)} listings from DataFrame.")

            if market_df is not None and not market_df.empty:
                self.market_df = market_df.copy()
                logger.info(f"Loaded {len(self.market_df)} market listings from DataFrame.")
            else:
                self.market_df = None
                logger.info("No market data DataFrame provided.")

            self._post_load_processing()
            return True, "DataFrames loaded and processed successfully."
        except Exception as e:
            logger.error(f"General error loading DataFrames: {e}", exc_info=True)
            self.listings_df = None
            self.market_df = None
            return False, str(e)

    def _post_load_processing(self):
        """Standardized processing pipeline for both listings and market data."""
        if self.listings_df is not None:
            logger.info("Running post-load processing on listings_df...")
            self.listings_df.columns = self.listings_df.columns.str.strip().str.lower()
            if 'id' not in self.listings_df.columns:
                logger.warning("Listings DF missing 'id' column. Creating 'id' from index.")
                self.listings_df['id'] = 'idx_' + self.listings_df.index.astype(str)

            occupancy_cols = [col for col in self.listings_df.columns if col.startswith('occupancy_')]
            numeric_cols_to_convert = ['price', 'lat', 'long', 'ratings', 'bedrooms', 'bathrooms', 'guests',
                                       'beds'] + occupancy_cols

            for col in numeric_cols_to_convert:
                if col in self.listings_df.columns:
                    self.listings_df[col] = pd.to_numeric(self.listings_df[col], errors='coerce')
                else:
                    logger.debug(f"Column '{col}' not found in listings_df for numeric conversion.")

            self.listings_df = self.parse_listing_data(self.listings_df)
            self.clean_data(self.listings_df)
            self.enrich_data(self.listings_df)
            logger.info("Post-load processing for listings_df complete.")

        if self.market_df is not None:
            logger.info("Running post-load processing on market_df...")
            self.market_df.columns = self.market_df.columns.str.strip().str.lower()
            if 'id' not in self.market_df.columns:
                logger.warning("Market DF missing 'id' column. Creating 'id' from index.")
                self.market_df['id'] = 'idx_' + self.market_df.index.astype(str)

            occupancy_cols_market = [col for col in self.market_df.columns if col.startswith('occupancy_')]
            numeric_cols_market = ['price', 'lat', 'long', 'ratings', 'bedrooms', 'bathrooms', 'guests',
                                   'beds'] + occupancy_cols_market

            for col in numeric_cols_market:
                if col in self.market_df.columns:
                    self.market_df[col] = pd.to_numeric(self.market_df[col], errors='coerce')
                else:
                    logger.debug(f"Column '{col}' not found in market_df for numeric conversion.")

            self.market_df = self.parse_listing_data(self.market_df)
            self.clean_data(self.market_df)
            self.enrich_data(self.market_df)
            logger.info("Post-load processing for market_df complete.")

    def parse_listing_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Parse JSON-like columns and normalise amenities on a copy of *df*.

        Steps, in order:

        1. ``details`` / ``pricing_details`` strings are decoded with
           ``json.loads``, falling back to ``ast.literal_eval`` for text that
           looks like a list. Values that cannot be decoded become ``None``.
        2. If the first non-null ``details`` value is a list, ``bedrooms``,
           ``bathrooms``, ``beds`` and ``guests`` are extracted from entries
           such as ``"2 bedrooms"`` or ``"1.5 baths"``. A column that is
           absent or more than 80% null is replaced by the extracted values;
           otherwise only its missing values are filled.
        3. ``amenities_list`` (lists, list-literal strings or comma-separated
           strings) is normalised into sorted lists of unique lower-cased
           names. If that yields no amenities at all, the nested-JSON
           ``amenities`` column is parsed instead.
        4. ``amenities_list`` is finalised and ``amenity_count`` is added.

        Args:
            df: Raw listings frame. It is not modified.

        Returns:
            The processed copy, or an empty DataFrame when *df* is None.
        """
        if df is None:
            logger.error("No DataFrame provided to parse_listing_data.")
            return pd.DataFrame()

        df_copy = df.copy()

        # --- 1. Decode JSON-like text columns -------------------------------
        for column in ('details', 'pricing_details'):
            if column not in df_copy.columns:
                continue
            logger.debug(f"Attempting to parse column '{column}' as JSON or list.")
            parsed_col = []
            invalid_count = 0
            for item in df_copy[column]:
                parsed_item = None
                if isinstance(item, str):
                    try:
                        # Undo one level of backslash escaping before decoding.
                        item_cleaned = item.replace('\\"', '"').replace('\\\\', '\\')
                        parsed_item = json.loads(item_cleaned)
                    except json.JSONDecodeError:
                        # Not JSON: accept a Python list literal instead.
                        stripped = item.strip()
                        try:
                            if stripped.startswith('[') and stripped.endswith(']'):
                                parsed_item = ast.literal_eval(stripped)
                                if not isinstance(parsed_item, list):
                                    parsed_item = None
                        except (ValueError, SyntaxError, TypeError):
                            parsed_item = None
                        if parsed_item is None:
                            invalid_count += 1
                    except Exception as json_e:
                        logger.debug(
                            f"Unexpected error parsing item in {column}: {json_e} - Item: {item[:100]}...")
                        parsed_item = None
                        invalid_count += 1
                elif isinstance(item, (dict, list)):
                    parsed_item = item
                parsed_col.append(parsed_item)
            if invalid_count > 0:
                logger.warning(
                    f"Could not parse {invalid_count}/{len(df_copy)} items in column '{column}' as JSON or list.")
            df_copy[column] = parsed_col

        # --- 2. Extract structured counts from 'details' --------------------
        details_parsed_as_list = False
        if 'details' in df_copy.columns:
            non_null_details = df_copy['details'].dropna()
            if not non_null_details.empty:
                # Only the first non-null value is inspected.
                details_parsed_as_list = isinstance(non_null_details.iloc[0], list)

        if details_parsed_as_list:
            logger.debug(
                "Attempting to extract structured details (bedrooms, bathrooms, beds) from 'details' list.")
            try:
                # "<number> <unit>". The trailing \b stops "bed" from matching
                # inside "bedrooms" (which made 'beds' pick up the bedroom
                # count), and \.\d+ keeps "1.25 baths" from parsing as 25.
                number = r"(\d+(?:\.\d+)?)\s+"
                detail_patterns = {
                    'bedrooms': re.compile(number + r"bedrooms?\b"),
                    'bathrooms': re.compile(number + r"bath(?:room)?s?\b"),
                    'beds': re.compile(number + r"beds?\b"),
                    'guests': re.compile(number + r"guests?\b"),
                }

                def extract_detail(detail_list, pattern):
                    """Return the first number matched by *pattern* in the list, or None."""
                    if not isinstance(detail_list, list):
                        return None
                    for entry in detail_list:
                        if not isinstance(entry, str):
                            continue
                        match = pattern.search(entry.lower())
                        if match:
                            val = float(match.group(1))
                            return int(val) if val.is_integer() else val
                    return None

                for detail_key, pattern in detail_patterns.items():
                    column_missing = detail_key not in df_copy.columns
                    if column_missing or df_copy[detail_key].isnull().mean() > 0.8:
                        # Column absent or mostly empty: take 'details' as the source.
                        logger.info(f"Attempting to extract '{detail_key}' from 'details' column.")
                        extracted_values = df_copy['details'].apply(
                            lambda x, p=pattern: extract_detail(x, p))
                        if not extracted_values.isnull().all():
                            df_copy[detail_key] = extracted_values
                            logger.info(f"Extracted '{detail_key}' from 'details' column.")
                        else:
                            logger.warning(f"Could not extract meaningful '{detail_key}' from 'details'.")
                    else:
                        # Column is mostly populated: only fill its gaps.
                        nan_mask = df_copy[detail_key].isnull()
                        if nan_mask.any():
                            logger.info(f"Attempting to fill missing '{detail_key}' from 'details' column.")
                            extracted_for_nan = df_copy.loc[nan_mask, 'details'].apply(
                                lambda x, p=pattern: extract_detail(x, p))
                            if not extracted_for_nan.isnull().all():
                                df_copy.loc[nan_mask, detail_key] = extracted_for_nan
                                logger.info(
                                    f"Filled {extracted_for_nan.notnull().sum()} missing '{detail_key}' values from 'details'.")
                            else:
                                logger.info(f"Could not fill missing '{detail_key}' from 'details'.")
            except Exception as e:
                # Extraction is best-effort; parsing continues without it.
                logger.warning(f"Failed during structured detail extraction: {e}", exc_info=True)
        elif 'details' in df_copy.columns:
            logger.warning(
                "Column 'details' exists but was not parsed into a list structure. Skipping detail extraction.")

        # --- 3. Normalise amenities ------------------------------------------
        amenity_pattern = re.compile(r"[\s\-:]+")

        def normalize_amenity(name):
            """Lower-case *name* and collapse whitespace, '-' and ':' runs to one space."""
            if not isinstance(name, str):
                return name
            # Strip again after substitution so "wifi -" and "wifi" agree and
            # a bare "-" normalises to "" (and is then skipped).
            return amenity_pattern.sub(" ", name.strip().lower()).strip()

        def collect_amenities(names):
            """Return the set of non-empty normalised amenity names in *names*."""
            collected = set()
            for name in names:
                if isinstance(name, str) and name.strip():
                    normalized = normalize_amenity(name)
                    if normalized:
                        collected.add(normalized)
            return collected

        def split_on_commas(text):
            """Split a comma-separated string into its non-empty, stripped parts."""
            return [part.strip() for part in text.split(',') if part.strip()]

        if 'amenities_list' in df_copy.columns:
            logger.info("Processing 'amenities_list' column.")
            processed_amenities_col = []
            for idx, item in enumerate(df_copy['amenities_list']):
                current_row_amenities = set()
                if isinstance(item, list):
                    current_row_amenities = collect_amenities(item)
                elif isinstance(item, str):
                    stripped = item.strip()
                    if stripped.startswith('[') and stripped.endswith(']'):
                        try:
                            parsed_list = ast.literal_eval(stripped)
                            if isinstance(parsed_list, list):
                                current_row_amenities = collect_amenities(parsed_list)
                        except (ValueError, SyntaxError):
                            # Bracketed but not a valid literal: treat as CSV.
                            current_row_amenities = collect_amenities(split_on_commas(item))
                    else:
                        current_row_amenities = collect_amenities(split_on_commas(item))
                elif pd.isna(item):
                    pass
                else:
                    logger.warning(f"Unexpected type in 'amenities_list' at index {idx}: {type(item)}")
                processed_amenities_col.append(sorted(current_row_amenities))
            df_copy['amenities_list_processed'] = processed_amenities_col
            logger.info("Successfully processed 'amenities_list' column.")

        should_fallback_to_amenities_col = True
        if 'amenities_list_processed' in df_copy.columns:
            if df_copy['amenities_list_processed'].apply(lambda x: len(x) > 0).any():
                should_fallback_to_amenities_col = False
                logger.info("Using data from 'amenities_list_processed'.")
                # NOTE: the helper column is kept on this path; it is only
                # dropped when falling back to the 'amenities' column below.
                df_copy['amenities_list'] = df_copy['amenities_list_processed']

        if should_fallback_to_amenities_col and 'amenities' in df_copy.columns:
            logger.warning("Falling back to parsing complex 'amenities' column (JSON structure).")
            extracted_amenities_from_complex = []
            parse_errors = 0
            for idx, item_json_str in enumerate(df_copy['amenities']):
                current_row_amenities = set()
                if isinstance(item_json_str, str):
                    try:
                        amenity_groups = json.loads(item_json_str.replace('\\"', '"').replace('\\\\', '\\'))
                        if isinstance(amenity_groups, list):
                            # Expected shape: [{"items": [{"name": ...} | "name", ...]}, ...]
                            for group in amenity_groups:
                                if not (isinstance(group, dict) and isinstance(group.get("items"), list)):
                                    continue
                                for amenity_item in group["items"]:
                                    if isinstance(amenity_item, dict) and "name" in amenity_item:
                                        current_row_amenities |= collect_amenities([amenity_item.get("name")])
                                    elif isinstance(amenity_item, str):
                                        current_row_amenities |= collect_amenities([amenity_item])
                        else:
                            parse_errors += 1
                    except Exception:
                        # Best-effort: a malformed row is counted and skipped.
                        parse_errors += 1
                elif pd.isna(item_json_str):
                    pass
                else:
                    logger.warning(
                        f"Unexpected type in 'amenities' column at index {idx}: {type(item_json_str)}")
                    parse_errors += 1
                extracted_amenities_from_complex.append(sorted(current_row_amenities))
            if parse_errors > 0:
                logger.warning(
                    f"Encountered errors parsing 'amenities' (complex JSON) for {parse_errors} rows.")
            df_copy['amenities_list'] = extracted_amenities_from_complex
            logger.info("Created/Updated 'amenities_list' by parsing complex 'amenities' column.")
            if 'amenities_list_processed' in df_copy.columns:
                del df_copy['amenities_list_processed']
        elif 'amenities_list' not in df_copy.columns:
            logger.warning("No usable amenity column found. 'amenities_list' will be empty.")
            df_copy['amenities_list'] = [[] for _ in range(len(df_copy))]

        # --- 4. Finalise -----------------------------------------------------
        # Anything that is not a list at this point becomes an empty list.
        df_copy['amenities_list'] = df_copy['amenities_list'].apply(
            lambda x: sorted(set(x)) if isinstance(x, list) else [])
        df_copy['amenity_count'] = df_copy['amenities_list'].apply(len)
        logger.debug("Finalized 'amenities_list' and 'amenity_count'.")
        return df_copy

    def _clean_amenity_name_for_col(self, amenity_name: str) -> str:
        if not isinstance(amenity_name, str): return "amenity_invalid_name"
        cleaned = amenity_name.lower();
        cleaned = cleaned.replace('"', '').replace("'", "").replace("’", "").replace("–", "-");
        cleaned = cleaned.replace("...", "").replace("!", "")
        cleaned = re.sub(r'^[^\w]+|[^\w]+$', '', cleaned);
        cleaned = re.sub(r'[^\w\s-]+', '', cleaned);
        cleaned = re.sub(r'[\s-]+', '_', cleaned);
        cleaned = re.sub(r'_+', '_', cleaned);
        cleaned = cleaned.strip('_')
        return f"amenity_{cleaned}" if cleaned else "amenity_unknown"

    def calculate_market_feature_importance(self, target_variable='price',
                                            min_frequency=MIN_AMENITY_FREQUENCY) -> Dict[str, float]:
        """Estimate per-amenity importance scores with a Ridge regression on market data.

        A copy of ``self.market_df`` is used. Numeric listing features are
        median-imputed and standard-scaled; every amenity occurring at least
        ``min_frequency`` times becomes a 0/1 indicator column. A
        ``Ridge(alpha=1.0)`` model is fitted against ``target_variable`` and the
        coefficients of the amenity indicators are reported.

        Args:
            target_variable: Column of ``self.market_df`` used as regression target.
            min_frequency: Minimum number of occurrences an amenity needs in
                order to get its own indicator column.

        Returns:
            Mapping of amenity name (lower-cased, stripped) to its coefficient
            rounded to two decimals, ordered by descending absolute value.
            An empty dict is returned when data is missing/insufficient or the
            model cannot be fitted.
        """
        if self.market_df is None or self.market_df.empty:
            logger.error("Market data not loaded.")
            return {}
        if target_variable not in self.market_df.columns:
            logger.error(f"Target variable '{target_variable}' not found.")
            return {}
        if 'amenities_list' not in self.market_df.columns:
            logger.error("'amenities_list' column not found.")
            return {}

        logger.info(f"Calculating dynamic feature importance for target '{target_variable}' using market data...")
        df = self.market_df.copy()
        df[target_variable] = pd.to_numeric(df[target_variable], errors='coerce')
        df = df.dropna(subset=[target_variable])
        if df.empty:
            logger.error(f"No valid market data after dropping NaNs in target '{target_variable}'.")
            return {}
        logger.info(f"Using {len(df)} market listings for importance calculation after dropping target NaNs.")

        # --- Numeric features: coerce to numbers and impute NaNs with the column median ---
        numeric_features_base = ['bedrooms', 'bathrooms', 'guests', 'beds', 'ratings', 'amenity_count',
                                 'avg_occupancy']
        valid_numeric_features = []
        for col in numeric_features_base:
            if col not in df.columns:
                logger.warning(f"Numeric feature '{col}' not found in market data.")
                continue
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if df[col].isnull().all():
                logger.warning(f"Numeric feature '{col}' is entirely NaN in market data. Dropping.")
                continue
            median_val = df[col].median()
            fill_val = median_val if pd.notna(median_val) else 0.0
            df[col] = df[col].fillna(fill_val)
            valid_numeric_features.append(col)
            logger.debug(f"Using numeric feature '{col}' (imputed NaNs with median: {fill_val:.2f}).")
        if not valid_numeric_features:
            logger.warning("No valid numeric features found in market data for importance calculation.")

        # --- Amenity features: keep only amenities that are frequent enough ---
        df['amenities_list_clean'] = df['amenities_list'].apply(
            lambda lst: [s.lower().strip() for s in lst if isinstance(s, str) and s.strip()]
            if isinstance(lst, list) else [])
        amenity_counts = Counter(amenity for sublist in df['amenities_list_clean'] for amenity in sublist)
        # Sorted so that column order and the winner of a column-name collision
        # do not depend on set iteration order (which varies between runs).
        frequent_amenities = sorted(
            amenity for amenity, count in amenity_counts.items() if count >= min_frequency)
        logger.info(
            f"Found {len(amenity_counts)} unique amenities in market data, keeping {len(frequent_amenities)} appearing at least {min_frequency} times.")
        if not frequent_amenities:
            logger.error("No amenities meet the minimum frequency threshold.")
            return {}

        amenity_feature_map = {}
        amenity_feature_names = []
        new_amenity_cols_data = {}
        df['amenities_set'] = df['amenities_list_clean'].apply(set)
        # Seeded with the existing columns so an amenity can never shadow a real column.
        processed_col_names = set(df.columns)
        for amenity in frequent_amenities:
            col_name = self._clean_amenity_name_for_col(amenity)
            if col_name in processed_col_names:
                conflicting_amenity = [a for a, c in amenity_feature_map.items() if c == col_name]
                logger.warning(
                    f"Amenity '{amenity}' generated conflicting column name '{col_name}' (possibly with '{conflicting_amenity}'). Skipping '{amenity}'.")
                continue
            new_amenity_cols_data[col_name] = df['amenities_set'].apply(
                lambda present, name=amenity: name in present).astype(int)
            amenity_feature_map[amenity] = col_name
            amenity_feature_names.append(col_name)
            processed_col_names.add(col_name)
        if not amenity_feature_names:
            logger.error("Failed to create any valid amenity feature columns after cleaning and frequency check.")
            return {}

        # Attach all indicator columns in one concat instead of column-by-column inserts.
        amenity_features_df = pd.DataFrame(new_amenity_cols_data, index=df.index)
        df = pd.concat([df, amenity_features_df], axis=1)
        logger.debug(f"Added {len(new_amenity_cols_data)} amenity feature columns to market DataFrame.")

        features = valid_numeric_features + amenity_feature_names
        X = df[features]
        y = df[target_variable]
        if X.empty or len(X) < max(2, len(features) // 10):
            logger.error(f"Not enough data points ({len(X)}) after feature preparation for reliable regression.")
            return {}

        # Only the numeric block is scaled; the 0/1 amenity indicators pass through untouched.
        numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
        preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, valid_numeric_features)],
                                         remainder='passthrough')
        model_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', Ridge(alpha=1.0))])
        try:
            logger.info(f"Training Ridge regression model with {len(features)} features on {len(X)} samples...")
            model_pipeline.fit(X, y)
            logger.info("Regression model trained successfully.")
            coefficients = model_pipeline.named_steps['regressor'].coef_
            try:
                processed_feature_names = model_pipeline.named_steps['preprocessor'].get_feature_names_out()
            except AttributeError:
                logger.warning("Could not use get_feature_names_out. Reconstructing feature names.")
                processed_feature_names = valid_numeric_features + amenity_feature_names
            if len(coefficients) != len(processed_feature_names):
                logger.error(
                    f"Coefficient count ({len(coefficients)}) != processed feature name count ({len(processed_feature_names)}).")
                logger.debug(f"Coeffs: {coefficients}")
                logger.debug(f"Names: {processed_feature_names}")
                return {}

            importances_raw = dict(zip(processed_feature_names, coefficients))
            reverse_amenity_feature_map = {v: k for k, v in amenity_feature_map.items()}
            final_amenity_importances = {}
            for feature_name, score in importances_raw.items():
                # Transformer output names look like "<step>__<column>"; keep the column part.
                cleaned_feature_name = feature_name.split('__')[-1]
                original_name = reverse_amenity_feature_map.get(cleaned_feature_name)
                if original_name:
                    final_amenity_importances[original_name] = round(score, 2)

            sorted_amenity_importances = dict(
                sorted(final_amenity_importances.items(), key=lambda item: abs(item[1]), reverse=True))
            logger.info(f"Calculated dynamic importance scores for {len(sorted_amenity_importances)} amenities.")
            logger.debug(
                f"Top 5 amenity importances (abs value): {dict(list(sorted_amenity_importances.items())[:5])}")
            logger.debug(
                f"Bottom 5 amenity importances (abs value): {dict(list(sorted_amenity_importances.items())[-5:])}")
            # Return the ordered mapping that was just logged (same keys/values as the unsorted one).
            return sorted_amenity_importances
        except Exception as e:
            logger.error(f"Error during model training or importance extraction: {e}", exc_info=True)
            return {}

    def enrich_data(self, df: pd.DataFrame) -> bool:
        """Add derived price, amenity and occupancy columns to ``df`` in place.

        Columns written (when their inputs exist):

        * ``price_per_bedroom`` / ``price_per_guest`` - ``price`` divided by the
          unit count; when the count is missing or not positive the plain
          ``price`` is used instead. ``price`` and the unit column are coerced
          to numeric as a side effect.
        * ``amenity_count`` - length of ``amenities_list`` (non-list cells are
          replaced by ``[]``), or ``0`` when neither column exists.
        * ``avg_occupancy`` - row mean of the numeric ``occupancy_*`` columns,
          or NaN when there are none and the column does not already exist.

        Args:
            df: Frame to enrich; it is modified in place.

        Returns:
            True on success, False when ``df`` is None or an error occurred
            (the error is logged, not raised).
        """
        if df is None:
            logger.warning("Cannot enrich data: df is None.")
            return False
        logger.info("Enriching data...")
        try:
            for unit_col, ratio_col in (('bedrooms', 'price_per_bedroom'), ('guests', 'price_per_guest')):
                if 'price' not in df.columns or unit_col not in df.columns:
                    continue
                df['price'] = pd.to_numeric(df['price'], errors='coerce')
                df[unit_col] = pd.to_numeric(df[unit_col], errors='coerce')
                # Vectorised replacement for a per-row apply: divide by the unit
                # count where it is a positive number, otherwise by 1 so the
                # plain price is kept. A NaN price stays NaN either way.
                divisor = df[unit_col].where(df[unit_col] > 0, 1)
                df[ratio_col] = df['price'] / divisor
                logger.debug(f"Calculated '{ratio_col}'.")

            if 'amenities_list' in df.columns:
                df['amenities_list'] = df['amenities_list'].apply(lambda x: x if isinstance(x, list) else [])
                df['amenity_count'] = df['amenities_list'].apply(len)
            elif 'amenity_count' not in df.columns:
                df['amenity_count'] = 0

            occupancy_cols = [col for col in df.columns if col.startswith('occupancy_')]
            valid_occ_cols = [col for col in occupancy_cols if pd.api.types.is_numeric_dtype(df[col])]
            if valid_occ_cols:
                df['avg_occupancy'] = df[valid_occ_cols].mean(axis=1, skipna=True)
                logger.debug(f"Calculated 'avg_occupancy' using columns: {valid_occ_cols}.")
            elif 'avg_occupancy' not in df.columns:
                # Nothing to average; make sure the column exists for downstream code.
                df['avg_occupancy'] = np.nan

            logger.info("Data enrichment completed successfully.")
            return True
        except Exception as e:
            logger.error(f"Error enriching data: {e}", exc_info=True)
            return False

    def clean_data(self, df: pd.DataFrame) -> bool:
        """Impute missing values and flag price outliers in ``df`` in place.

        * Numeric columns: NaNs are replaced by the column median (or 0 when no
          median can be computed).
        * ``object`` / ``category`` columns: NaNs are replaced by ``"Unknown"``.
        * ``price`` (when numeric): adds ``price_outlier_flag`` and
          ``price_capped`` using the 1.5 * IQR rule, with the lower bound
          floored at 0.

        Args:
            df: Frame to clean; it is modified in place.

        Returns:
            True on success, False when ``df`` is None or an error occurred
            (the error is logged, not raised).
        """
        if df is None:
            logger.warning("Cannot clean data: df is None.")
            return False
        logger.info("Cleaning data...")
        try:
            numeric_cols = df.select_dtypes(include=np.number).columns
            for col in numeric_cols:
                if not df[col].isnull().any():
                    continue
                median_val = df[col].median(skipna=True)
                fill_val = median_val if pd.notna(median_val) else 0
                # Assign the result back: an in-place fillna on ``df[col]`` is a
                # chained operation that pandas warns about and that leaves
                # ``df`` untouched when Copy-on-Write is enabled.
                df[col] = df[col].fillna(fill_val)
                if pd.notna(median_val):
                    logger.debug(f"Filled NaNs in '{col}' with median ({median_val:.2f}).")
                else:
                    logger.warning(f"Could not calculate median for '{col}'. Filled NaNs with 0.")

            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
            for col in categorical_cols:
                column = df[col]
                if not column.isnull().any():
                    continue
                # A categorical column rejects values outside its categories,
                # so register the placeholder before filling.
                if isinstance(column.dtype, pd.CategoricalDtype) and "Unknown" not in column.cat.categories:
                    column = column.cat.add_categories("Unknown")
                df[col] = column.fillna("Unknown")
                logger.debug(f"Filled NaNs in categorical column '{col}' with 'Unknown'.")

            if 'price' in df.columns and pd.api.types.is_numeric_dtype(df['price']):
                q1 = df['price'].quantile(0.25)
                q3 = df['price'].quantile(0.75)
                iqr = q3 - q1
                outlier_mask = None
                if pd.notna(iqr) and iqr > 0:
                    lower_bound = max(0, q1 - 1.5 * iqr)
                    upper_bound = q3 + 1.5 * iqr
                    outlier_mask = (df['price'] < lower_bound) | (df['price'] > upper_bound)
                if outlier_mask is not None and outlier_mask.any():
                    logger.info(f"Identified {outlier_mask.sum()} price outliers.")
                    df['price_outlier_flag'] = outlier_mask
                    df['price_capped'] = df['price'].clip(lower=lower_bound, upper=upper_bound)
                else:
                    # No spread (IQR is 0/NaN) or nothing outside the fences.
                    df['price_outlier_flag'] = False
                    df['price_capped'] = df['price']

            logger.info("Data cleaning completed successfully.")
            return True
        except Exception as e:
            logger.error(f"Error cleaning data: {e}", exc_info=True)
            return False

class PromptManager:
    """Manages prompt templates and customization for the LLM"""
    def __init__(self):
        """Set up default parameters, register the built-in template and load saved ones.

        The built-in ``detailed_v3`` template is registered first; templates
        found in ``PROMPT_TEMPLATE_FILE`` are then added on top (silently when
        the load has nothing to report).
        """
        self.templates: Dict[str, str] = {}
        # Fallback value for every placeholder the built-in template uses.
        # The type of each default is also what get_prompt() casts caller values to.
        self.default_params: Dict[str, Any] = {
            # Listing details
            "listing_id": "N/A",
            "property_type": "Unknown",
            "location": "Unknown",
            "bedrooms": 0,
            "bathrooms": 0.0,
            "guests": 0,
            "beds": 0,
            "price": 0.0,
            "ratings": 0.0,
            "lat": 0.0,
            "long": 0.0,
            "amenities_preview": "N/A",
            "amenities_list": [],
            "amenity_count": 0,
            "occupancy_recent": 0.0,
            "occupancy_extended": 0.0,
            "avg_occupancy": 0.0,
            "price_per_bedroom": 0.0,
            "price_per_guest": 0.0,
            # Market comparison
            "market_avg_price": 0.0,
            "market_median_price": 0.0,
            "market_avg_rating": 0.0,
            "market_avg_occupancy": 0.0,
            "market_price_per_bedroom": 0.0,
            "market_price_per_guest": 0.0,
            "market_similar_listings_count": 0,
            # External factors and strategy
            "vacancy_rate": 0.0,
            "upcoming_events": 0,
            "event_radius": 25,
            "market_trend": "Stable",
            "seasonal_factors": "Normal",
            "similar_competitors": 0,
            "balance_strategy": "Balanced Revenue/Occupancy",
            "target_roi": 15.0,
            "market_trend_source": "N/A",
            "fred_hpi_trend": "Unknown",
            "fred_mortgage_rate_level": "Unknown",
            # Amenity analysis summaries
            "present_amenity_scores_summary": "Analysis N/A",
            "missing_valuable_amenities_summary": "Analysis N/A",
        }
        self.prompt_history: List[Dict] = []
        self.prompt_performance: Dict[str, Dict] = {}

        # Built-in template; literal braces are doubled because it is rendered with str.format().
        detailed_v3_template = """You are an AI pricing optimization assistant for short-term rentals (STR). Your goal is to recommend an optimal *nightly price* for the provided listing, critically evaluating **all** provided data points, including calculated amenity importance scores derived from market data analysis.

    **Analyze the following information comprehensively:**

    **LISTING DETAILS:**
    - Listing ID: {listing_id}
    - Property Type: {property_type}
    - Location: {location} (Coordinates: {lat:.4f}, {long:.4f})
    - Base Price: ${price:.2f}
    - Bedrooms: {bedrooms}
    - Bathrooms: {bathrooms}
    - Sleeps: {guests} guests
    - Overall Rating: {ratings:.1f} / 5.0
    - Total Amenities Provided: {amenity_count}
    - Amenities Preview: {amenities_preview}
    - **Present Amenities (w/ Market Price Impact Score):** {present_amenity_scores_summary}  # Score indicates approx. $ price change associated with amenity based on market model, controlling for other factors. Positive=higher price, Negative=lower price.
    - **Missing Valuable Amenities (w/ Market Price Impact Score):** {missing_valuable_amenities_summary} # Lists missing amenities that have a significant positive price impact score (> $5.00) in the market model.
    - Recent Occupancy (if available): {occupancy_recent:.1f}%
    - Extended Occupancy (if available): {occupancy_extended:.1f}%
    - Average Occupancy (calculated): {avg_occupancy:.1f}%
    - Calculated Price/Bedroom: ${price_per_bedroom:.2f}
    - Calculated Price/Guest: ${price_per_guest:.2f}

    **MARKET COMPARISON (Nearby/Similar Listings):**
    - Compared Listings Count: {market_similar_listings_count}
    - Average Price: ${market_avg_price:.2f}
    - Median Price: ${market_median_price:.2f}
    - Average Rating: {market_avg_rating:.1f} / 5.0
    - Average Occupancy: {market_avg_occupancy:.1f}%
    - Avg Price/Bedroom: ${market_price_per_bedroom:.2f}
    - Avg Price/Guest: ${market_price_per_guest:.2f}

    **EXTERNAL FACTORS & CONDITIONS:**
    - Local Area Vacancy Rate: {vacancy_rate:.1f}%
    - Upcoming Events (90d within {event_radius} miles): {upcoming_events}
    - Estimated Market Trend (from {market_trend_source}): {market_trend}
    - FRED Housing Price Index Trend (National): {fred_hpi_trend}
    - FRED Mortgage Rate Level (30yr Fixed): {fred_mortgage_rate_level}
    - Current Seasonality: {seasonal_factors}

    **OPTIMIZATION GOAL:**
    - Target Return on Investment (ROI): {target_roi:.1f}% (Guideline)
    - Pricing Strategy Focus: {balance_strategy}

    **TASK:**
    Based *only* on the information provided above, recommend an optimal *single nightly price* for this listing.

    **Your reasoning MUST be detailed and explicitly address the following points, referencing the specific data provided:**
    1.  **Market Positioning:** Justify the recommended price relative to the market average (${market_avg_price:.2f}) and median (${market_median_price:.2f}). How much premium or discount is warranted based on the listing's rating ({ratings:.1f}), bedrooms ({bedrooms}), bathrooms ({bathrooms}), and guest capacity ({guests}) compared to market averages?
    2.  **Amenity Impact (Using Scores):** Analyze the **Present Amenities & Scores** ({present_amenity_scores_summary}). How do amenities with significant positive scores (e.g., > +$10) support a higher price? How do amenities with negative scores detract? Analyze the **Missing Valuable Amenities** ({missing_valuable_amenities_summary}). Does the absence of amenities with high positive scores limit the potential price premium? State *which specific* amenities and their associated scores most influence your decision and *why*.
    3.  **Occupancy Analysis:** How do the listing's own occupancy rates (avg: {avg_occupancy:.1f}%) and the market occupancy ({market_avg_occupancy:.1f}%) influence the price recommendation? Consider demand implications.
    4.  **External Factor Synthesis:** Explain the *combined* influence of Events ({upcoming_events}), Market Trend ({market_trend}), FRED data ({fred_hpi_trend}, {fred_mortgage_rate_level}), Vacancy Rate ({vacancy_rate:.1f}%), and Seasonality ({seasonal_factors}) on the price.
    5.  **ROI Alignment:** Briefly explain how the recommended price relates to the {target_roi:.1f}% ROI target and the chosen strategy ({balance_strategy}). Is it achievable given the market context?
    6.  **Overall Justification:** Summarize the primary reasons for the specific recommended price compared to the current base price (${price:.2f}), integrating the quantitative amenity scores, core features (beds/baths/guests/rating), market position, and external factors.

    Provide a confidence score (0.0-1.0) reflecting certainty in the recommendation based *only* on the provided data, and a plausible nightly price range (min/max) derived from your analysis.

    **CRITICAL: Your *entire* response MUST be ONLY the JSON object specified below, starting with `{{` and ending with `}}`. Do not include any other text, explanations, or markdown formatting outside the JSON structure.**

    **OUTPUT FORMAT (Strict JSON only):**
    ```json
    {{
      "recommended_price": float,
      "reasoning": "string (Detailed explanation covering all 6 points above, referencing specific data values and amenity scores)",
      "confidence": float (0.0-1.0),
      "primary_factors": [
          "string (List 3-5 key factors with their specific influence, e.g., 'High rating ({ratings:.1f}) justifies premium over market avg (${market_avg_price:.2f})', 'Missing valuable amenity: Pool (Score: +$35.00) limits premium', 'Present valuable amenity: Hot Tub (Score: +$22.50) supports price', 'High event count ({upcoming_events}) boosts demand')"
        ],
      "price_range_suggestion": {{
          "min": float,
          "max": float
       }}
    }}
    ```"""
        self.add_template("detailed_v3", detailed_v3_template)

        # Templates stored on disk are added after (and may replace) the built-in one.
        self.load_templates(PROMPT_TEMPLATE_FILE, silent=True)

    def get_prompt(self, template_name: str, **kwargs) -> Tuple[str, int]:
        """Render a prompt from a template and append it to ``self.prompt_history``.

        Template resolution: the requested name, else ``get_best_template()``,
        else the first registered template. Caller values are merged over
        ``self.default_params``; a value whose type differs from the default's
        type is cast to it (the default is kept when the cast fails), and a
        ``None`` value falls back to the default. ``amenities_preview`` is
        always rebuilt from the first five entries of ``amenities_list``.

        If the chosen template needs a parameter that is not available, a
        template named ``simple_v1`` is tried as a fallback when registered.

        Args:
            template_name: Name of the template to render.
            **kwargs: Values for the template placeholders.

        Returns:
            ``(prompt, prompt_id)`` where ``prompt_id`` is the index of the new
            history entry, or ``(error_message, -1)`` when no prompt could be
            produced.
        """
        if template_name not in self.templates:
            logger.warning(f"Template '{template_name}' not found, using best or default.")
            template_name = self.get_best_template(fallback='detailed_v3')
        if template_name not in self.templates:
            template_name = next(iter(self.templates), None)
            if not template_name:
                logger.critical("CRITICAL: No prompt templates available!")
                return "Error: No prompt templates loaded.", -1
            # Only warn when the first-available fallback was really taken,
            # not on every successful lookup.
            logger.warning(f"Using first available template as fallback: '{template_name}'")

        template = self.templates[template_name]
        params = self.default_params.copy()
        for key, value in kwargs.items():
            if key not in params:
                params[key] = value
                logger.debug(f"Added non-default parameter '{key}' to prompt context.")
                continue
            expected_type = type(params[key])
            if value is None or isinstance(value, expected_type):
                # None is replaced by the default in the pass below.
                params[key] = value
                continue
            try:
                if expected_type is float and isinstance(value, int):
                    params[key] = float(value)
                elif expected_type is int and isinstance(value, float):
                    params[key] = int(round(value))
                elif expected_type is bool:
                    if isinstance(value, str):
                        params[key] = value.lower() in ['true', '1', 't', 'y', 'yes']
                    else:
                        params[key] = bool(value)
                else:
                    params[key] = expected_type(value)
            except (ValueError, TypeError, OverflowError) as e:
                # OverflowError covers int(round(inf)); the default stays in place.
                logger.warning(
                    f"Could not cast provided value for '{key}' (value: '{value}', type: {type(value)}) to expected type {expected_type}. Using default. Error: {e}")

        for key, default_val in self.default_params.items():
            if params.get(key) is None and default_val is not None:
                logger.debug(f"Parameter '{key}' was None, reverting to default: {default_val}")
                params[key] = default_val

        if 'amenities_list' in params and isinstance(params['amenities_list'], list):
            preview_list = [str(a) for a in params['amenities_list'][:5] if isinstance(a, (str, int, float))]
            params['amenities_preview'] = ', '.join(preview_list) + (
                '...' if len(params['amenities_list']) > 5 else '')
        else:
            params['amenities_preview'] = "N/A"

        try:
            prompt = template.format(**params)
            prompt_id = len(self.prompt_history)
            self.prompt_history.append(
                {"id": prompt_id, "template": template_name, "parameters_used": kwargs, "full_parameters": params,
                 "prompt": prompt, "timestamp": datetime.now().isoformat(), "performance": None})
            logger.debug(f"Generated prompt ID {prompt_id} using template '{template_name}'.")
            return prompt, prompt_id
        except KeyError as e:
            logger.error(f"CRITICAL: Missing parameter {{'{e}'}} required by template '{template_name}'.")
            logger.debug(f"Available parameters: {list(params.keys())}")
            simple_template_name = 'simple_v1'
            if simple_template_name not in self.templates or template_name == simple_template_name:
                return f"Critical Error: Failed to format prompt for template '{template_name}' (missing key: {e}). No fallback available.", -1
            logger.warning(f"Attempting fallback to template '{simple_template_name}'.")
            try:
                simple_template = self.templates[simple_template_name]
                # Pass only the known parameters whose "{name" text occurs in the fallback template.
                simple_params = {k: params.get(k, self.default_params.get(k)) for k in self.default_params if
                                 f"{{{k}" in simple_template}
                prompt = simple_template.format(**simple_params)
                prompt_id = len(self.prompt_history)
                self.prompt_history.append(
                    {"id": prompt_id, "template": simple_template_name, "parameters_used": kwargs,
                     "full_parameters": simple_params, "prompt": prompt, "timestamp": datetime.now().isoformat(),
                     "performance": None})
                logger.info(
                    f"Successfully generated prompt ID {prompt_id} using fallback template '{simple_template_name}'.")
                return prompt, prompt_id
            except Exception as fallback_e:
                logger.critical(
                    f"Fallback prompt generation also failed for template '{simple_template_name}': {fallback_e}")
                return f"Critical Error: Failed to format primary prompt (missing key: {e}) and fallback prompt.", -1
        except Exception as e:
            logger.critical(f"Unexpected error formatting prompt template '{template_name}': {e}",
                            exc_info=True)
            return f"Critical Error: Unexpected prompt formatting error: {e}", -1

    def record_performance(self, prompt_id: int, metrics: Dict[str, Any]) -> None:
        """Attach ``metrics`` to a history entry and update its template's running totals.

        Args:
            prompt_id: Index into ``self.prompt_history`` (as returned by ``get_prompt``).
            metrics: Result metrics; the keys read here are ``confidence``,
                ``response_time_s``, ``success``, ``has_recommendation`` and ``error``.

        Invalid ids or non-dict metrics are logged and ignored.
        """
        if not (0 <= prompt_id < len(self.prompt_history)):
            logger.error(f"Invalid prompt ID {prompt_id} for performance recording.")
            return
        if not isinstance(metrics, dict):
            logger.error(f"Invalid metrics format for prompt ID {prompt_id}.")
            return

        entry = self.prompt_history[prompt_id]
        entry["performance"] = metrics
        logger.debug(f"Recorded performance for prompt ID {prompt_id}")

        # Create the per-template accumulator on first use.
        totals = self.prompt_performance.setdefault(
            entry["template"],
            {
                "count": 0,
                "sum_confidence": 0.0,
                "successful_parses": 0,
                "sum_response_time_s": 0.0,
                "recommendation_provided": 0,
                "errors": 0,
            },
        )

        totals["count"] += 1
        # "or 0.0" turns a stored None (or other falsy value) into zero before the cast.
        totals["sum_confidence"] += float(metrics.get("confidence", 0.0) or 0.0)
        totals["sum_response_time_s"] += float(metrics.get("response_time_s", 0.0) or 0.0)
        if metrics.get("success", False):
            totals["successful_parses"] += 1
        if metrics.get("has_recommendation", False):
            totals["recommendation_provided"] += 1
        if metrics.get("error") is not None:
            totals["errors"] += 1

    def get_best_template(self, fallback: str = 'detailed_v3') -> str:
        """Pick the registered template with the highest recorded performance score.

        Score per template = 0.4 * recommendation rate + 0.3 * average
        confidence + 0.2 * parse success rate - 0.1 * error rate.

        Args:
            fallback: Name to use when there is no performance data or no
                scored template is registered.

        Returns:
            The best template name; otherwise ``fallback`` when it is
            registered, else the first registered template, else ``fallback``
            itself.
        """

        def resolve_fallback() -> str:
            if fallback in self.templates:
                return fallback
            return next(iter(self.templates), fallback)

        if not self.prompt_performance:
            logger.info(f"No prompt performance data, returning fallback template '{fallback}'.")
            return resolve_fallback()

        winner = fallback
        winner_score = -1.0
        logger.debug(f"Evaluating prompt templates: {list(self.prompt_performance.keys())}")
        for template, metrics in self.prompt_performance.items():
            count = metrics.get("count", 0)
            if count == 0:
                continue

            parse_success_rate = metrics.get("successful_parses", 0) / count
            recommendation_rate = metrics.get("recommendation_provided", 0) / count
            error_rate = metrics.get("errors", 0) / count
            avg_confidence = metrics.get("sum_confidence", 0.0) / count
            score = (0.4 * recommendation_rate) + (0.3 * avg_confidence) + (0.2 * parse_success_rate) - (
                    0.1 * error_rate)
            logger.debug(
                f"Template '{template}': Score={score:.3f} (RecRate={recommendation_rate:.2f}, AvgConf={avg_confidence:.2f}, ParseRate={parse_success_rate:.2f}, ErrRate={error_rate:.2f})")

            if not score > winner_score:
                continue
            if template not in self.templates:
                # Performance data can outlive a template that is no longer registered.
                logger.warning(
                    f"Template '{template}' scored highest ({score:.3f}) but is not currently loaded. Ignoring.")
                continue
            winner_score = score
            winner = template

        if winner not in self.templates:
            logger.warning(f"Best performing template '{winner}' not found. Reverting to fallback '{fallback}'.")
            return resolve_fallback()

        logger.info(f"Best performing template selected: '{winner}' with score {winner_score:.3f}")
        return winner

    def add_template(self, name: str, template: str) -> bool:
        """Register (or overwrite) a prompt template after a trial format.

        The template is test-rendered with the default parameters plus dummy
        strings for any placeholder that has no default. A failing trial render
        is only logged as a warning; the template is stored regardless.

        Args:
            name: Non-empty template name.
            template: Non-empty ``str.format`` template text.

        Returns:
            True when the template was stored, False for invalid arguments or
            an unexpected error.
        """
        if not isinstance(name, str) or not name:
            logger.error("Template name must be non-empty string.")
            return False
        if not isinstance(template, str) or not template:
            logger.error("Template content must be non-empty string.")
            return False
        if name in self.templates:
            logger.warning(f"Template '{name}' already exists, will be overwritten.")

        try:
            trial_params = self.default_params.copy()
            # Placeholder names, with an optional ":spec" / "!conv" suffix ignored.
            found = set(re.findall(r"\{([a-zA-Z_][a-zA-Z0-9_]*)(?:[:!][^}]+)?\}", template))
            missing_defaults = found - set(trial_params.keys())
            if missing_defaults:
                logger.debug(f"Template '{name}' uses placeholders without defaults: {missing_defaults}.")
                for key in missing_defaults:
                    trial_params[key] = f"<{key}_placeholder>"

            try:
                template.format(**trial_params)
            except (ValueError, KeyError, TypeError) as fmt_err:
                logger.warning(f"Template '{name}' format test raised error: {fmt_err}. Adding anyway.")

            self.templates[name] = template
            logger.info(f"Added/Updated template: '{name}'")
            return True
        except Exception as e:
            logger.error(f"Error adding template '{name}': {e}", exc_info=True)
            return False

    def save_templates(self, file_path: str = PROMPT_TEMPLATE_FILE) -> bool:
        """Write ``self.templates`` to ``file_path`` as indented UTF-8 JSON.

        Args:
            file_path: Destination path; the file is overwritten.

        Returns:
            True on success, False when writing failed (the error is logged).
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as out_file:
                json.dump(self.templates, out_file, indent=2, ensure_ascii=False)
            logger.info(f"Prompt templates saved to {file_path}")
            return True
        except OSError as io_err:  # IOError is an alias of OSError in Python 3
            logger.error(f"I/O error saving templates to {file_path}: {io_err}")
            return False
        except Exception as unexpected:
            logger.error(f"Unexpected error saving templates: {unexpected}", exc_info=True)
            return False

    def load_templates(self, file_path: str = PROMPT_TEMPLATE_FILE, silent: bool = False) -> bool:
        """Load templates from a JSON file and register each via ``add_template``.

        The file must contain a JSON object mapping template names to template
        strings. Entries rejected by ``add_template`` are skipped; they do not
        make the call fail.

        Args:
            file_path: Path of the JSON template file.
            silent: When True, suppress the informational log messages. Errors
                are still logged, and the return value is not affected.

        Returns:
            True when the file was read and processed; False when it is
            missing, unreadable, not valid JSON, or not a JSON object.
        """
        if not os.path.exists(file_path):
            if not silent:
                logger.info(f"Template file {file_path} not found. Using built-in templates only.")
            # Return regardless of `silent`: the one-line form tied the return to
            # the `if`, so a silent call fell through to open() on a missing file.
            return False
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                loaded_templates = json.load(f)
            if not isinstance(loaded_templates, dict):
                logger.error(f"Invalid format in template file {file_path}.")
                return False
            valid_loaded = 0
            for name, template in loaded_templates.items():
                if self.add_template(name, template):
                    valid_loaded += 1
            if not silent:
                logger.info(
                    f"Loaded/Updated {valid_loaded}/{len(loaded_templates)} templates from {file_path}. Total templates now: {len(self.templates)}")
            # Success is reported for silent calls too (they used to return None).
            return True
        except json.JSONDecodeError as e:
            logger.error(f"JSON decode error loading templates from {file_path}: {e}")
            return False
        except IOError as e:
            logger.error(f"I/O error loading templates from {file_path}: {e}")
            return False
        except Exception as e:
            logger.error(f"Unexpected error loading templates: {e}", exc_info=True)
            return False

    def export_prompt_history(self, file_path: str = PROMPT_HISTORY_FILE) -> bool:
        """Flatten ``self.prompt_history`` into a CSV file.

        Each history entry becomes one row holding the prompt id, template,
        timestamp, prompt length, three input parameters and, when the entry
        carries a non-empty ``performance`` dict, its performance fields.
        List/dict cell values are stringified before writing.

        Args:
            file_path: Destination CSV path.

        Returns:
            True when the CSV was written; False when the history is empty or
            an error occurred (the error is logged).
        """
        if not self.prompt_history:
            logger.info("Prompt history is empty.")
            return False

        # CSV column -> key of the entry's "performance" dict that fills it.
        perf_fields = (
            ("perf_success_parse", "success"),
            ("perf_recommendation_provided", "has_recommendation"),
            ("perf_confidence", "confidence"),
            ("perf_response_time_s", "response_time_s"),
            ("perf_recommended_price", "recommended_price"),
            ("perf_error", "error"),
        )
        columns_order = ["prompt_id", "timestamp", "template", "input_listing_id", "input_price",
                         "input_target_roi", "prompt_length", "perf_success_parse", "perf_recommendation_provided",
                         "perf_confidence", "perf_response_time_s", "perf_recommended_price", "perf_error"]
        try:
            history_data = []
            for entry in self.prompt_history:
                row = {
                    "prompt_id": entry.get("id"),
                    "template": entry.get("template"),
                    "timestamp": entry.get("timestamp"),
                    "prompt_length": len(entry.get("prompt", "")),
                }
                params = entry.get("full_parameters", {})
                row["input_listing_id"] = params.get("listing_id")
                row["input_price"] = params.get("price")
                row["input_target_roi"] = params.get("target_roi")
                for column, _ in perf_fields:
                    row[column] = None

                perf = entry.get("performance")
                if perf and isinstance(perf, dict):
                    for column, perf_key in perf_fields:
                        row[column] = perf.get(perf_key)
                history_data.append(row)

            if not history_data:
                logger.warning("No processable data in prompt history.")
                return False

            df = pd.DataFrame(history_data)
            absent_columns = [col for col in columns_order if col not in df.columns]
            for col in absent_columns:
                df[col] = None
            df = df[columns_order]

            # Containers are written as their string form so each stays in one CSV cell.
            for col in df.columns:
                holds_container = df[col].apply(lambda x: isinstance(x, (list, dict)))
                if holds_container.any():
                    df[col] = df[col].astype(str)

            df.to_csv(file_path, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding='utf-8')
            logger.info(f"Prompt history exported to {file_path}")
            return True
        except Exception as e:
            logger.error(f"Error exporting prompt history: {e}", exc_info=True)
            return False

class LLMAgent:
    """Agent using OpenRouter for pricing optimization with prompt management and caching."""
    def __init__(self, model: str = OPENROUTER_MODEL_NAME, prompt_manager: PromptManager = None):
        """Set up the agent, read the API key and load the on-disk response cache.

        Args:
            model: Model identifier sent to OpenRouter.
            prompt_manager: Prompt manager to use; a new ``PromptManager`` is
                created when none (or a falsy value) is given.
        """
        self.model = model
        self.prompt_manager = prompt_manager or PromptManager()
        self.response_cache: Dict[str, Dict] = {}
        self.response_quality_log: List[Dict] = []

        api_key = os.environ.get("OPENROUTER_API_KEY")
        self.openrouter_api_key = api_key
        if api_key:
            logger.info(f"LLM Agent initialized for OpenRouter with model: '{self.model}'.")
        else:
            logger.critical("OPENROUTER_API_KEY not found. LLM Agent will not function.")

        # Best-effort load of previously saved responses.
        self.load_cache(LLM_CACHE_FILE, silent=True)

    def _clean_llm_output(self, raw_output: str) -> str:
        if not isinstance(raw_output, str): logger.warning("Received non-string output from LLM."); return ""
        logger.debug(f"Raw LLM Output (first 500 chars):\n{raw_output[:500]}...");
        cleaned = re.sub(r'<think>.*?</think>', '', raw_output, flags=re.DOTALL | re.IGNORECASE)
        if len(cleaned) < len(raw_output): logger.debug("Removed <think> tags.")
        cleaned = re.sub(r'^```(?:json|JSON)?\s*', '', cleaned.strip(), flags=re.MULTILINE);
        cleaned = re.sub(r'\s*```$', '', cleaned.strip(), flags=re.MULTILINE)
        if len(cleaned.strip()) < len(raw_output.strip()) and '<think>' not in raw_output: logger.debug(
            "Removed markdown code fences.")
        cleaned = cleaned.strip();
        json_match = re.search(r'\{.*\}', cleaned, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
            if json_str.startswith('{') and json_str.endswith('}'):
                try:
                    json.loads(json_str); logger.debug("Extracted valid JSON block using regex."); return json_str
                except json.JSONDecodeError:
                    logger.warning("Regex found block, but invalid JSON. Returning cleaned string.")
            else:
                logger.debug("Regex match didn't look like JSON. Returning cleaned string.")
        else:
            logger.debug("No JSON block found using regex. Returning cleaned string.")
        return cleaned

    def _parse_recommendation(self, llm_response_content: str) -> Dict[str, Any]:
        cleaned_content = self._clean_llm_output(llm_response_content);
        result = {"recommended_price": None, "reasoning": cleaned_content, "confidence": 0.0, "success": False,
                  "has_recommendation": False, "error": None, "primary_factors": [],
                  "price_range_suggestion": {"min": None, "max": None}}
        try:
            if cleaned_content.startswith('{') and cleaned_content.endswith('}'):
                parsed_json = json.loads(cleaned_content)
                if isinstance(parsed_json, dict):
                    rec_price_raw = parsed_json.get("recommended_price");
                    reasoning_raw = parsed_json.get("reasoning");
                    confidence_raw = parsed_json.get("confidence");
                    factors_raw = parsed_json.get("primary_factors");
                    range_raw = parsed_json.get("price_range_suggestion")
                    if rec_price_raw is not None:
                        try:
                            price = float(rec_price_raw); result["recommended_price"] = price if price > 0 else None;
                            result["has_recommendation"] = price > 0
                        except (ValueError, TypeError):
                            logger.warning(f"Could not convert recommended_price '{rec_price_raw}' to float.")
                    if isinstance(reasoning_raw, str) and reasoning_raw.strip():
                        result["reasoning"] = reasoning_raw.strip()
                    elif result["has_recommendation"]:
                        result["reasoning"] = "[Reasoning missing in JSON response]"
                    if confidence_raw is not None:
                        try:
                            conf = float(confidence_raw); result["confidence"] = max(0.0, min(1.0, conf))
                        except (ValueError, TypeError):
                            logger.warning(f"Could not convert confidence '{confidence_raw}' to float.")
                    if isinstance(factors_raw, list): result["primary_factors"] = [str(f) for f in factors_raw if
                                                                                   f is not None]
                    if isinstance(range_raw, dict):
                        min_p_raw, max_p_raw = range_raw.get("min"), range_raw.get("max");
                        min_p, max_p = None, None
                        try:
                            if min_p_raw is not None: min_p = float(min_p_raw)
                            if max_p_raw is not None: max_p = float(max_p_raw)
                            if min_p is not None and max_p is not None and min_p > max_p: min_p, max_p = max_p, min_p
                            if min_p is not None and min_p < 0: min_p = 0.0
                            if max_p is not None and max_p < 0: max_p = 0.0
                            result["price_range_suggestion"]["min"] = min_p;
                            result["price_range_suggestion"]["max"] = max_p
                        except (ValueError, TypeError):
                            logger.warning(f"Could not convert price range values '{range_raw}' to floats.")
                    if result["has_recommendation"]:
                        result["success"] = True; logger.info(
                            f"Successfully parsed JSON. Price: {result['recommended_price']}, Conf: {result['confidence']:.2f}")
                    else:
                        logger.warning("Parsed JSON, but failed to extract a valid positive recommended_price.");
                        result["error"] = "JSON parsed but no valid price found"
                    return result
                else:
                    logger.warning("Parsed JSON is not a dictionary."); result[
                        "error"] = "Parsed JSON is not a dictionary"
        except json.JSONDecodeError as json_err:
            logger.warning(f"Failed to parse cleaned output as JSON: {json_err}. Content: {cleaned_content[:200]}...");
            result["error"] = f"JSONDecodeError: {json_err}"
        except Exception as e:
            logger.error(f"Unexpected error during JSON parsing: {e}", exc_info=True); result[
                "error"] = f"Unexpected JSON parsing error: {e}"
        if not result["has_recommendation"]:
            logger.debug("Attempting regex fallback to find price.");
            price_found_by_regex = None
            context_patterns = [
                r'(?:recommend(?:ed)?|optimal|suggested)\s+(?:price|rate)\s*(?:is|:)?\s*\$?(\b\d{1,5}(?:[.,]\d{1,2})?\b)',
                r'\$?(\b\d{1,5}(?:[.,]\d{1,2})?\b)\s*(?:is|as)\s+the\s+(?:recommend(?:ed)?|optimal|suggested)\s+(?:price|rate)']
            for pattern in context_patterns:
                match = re.search(pattern, cleaned_content, re.IGNORECASE)
                if match:
                    try:
                        price_str = match.group(1).replace(',', ''); price = float(
                            price_str); price_found_by_regex = price if price > 0 else None; logger.info(
                            f"Extracted price {price_found_by_regex} using context regex."); break
                    except (ValueError, TypeError, IndexError):
                        logger.warning(
                            f"Context regex pattern '{pattern}' matched, but failed extraction from group 1: '{match.group(1)}'")
            if price_found_by_regex is None:
                if "recommended_price" in cleaned_content.lower():  # Only search if the keyword is present
                    general_matches = re.findall(r'\$?(\b\d{2,5}(?:[.,]\d{1,2})?\b)', cleaned_content)
                    if general_matches:
                        potential_prices = [float(p.replace(',', '')) for p in general_matches if
                                            float(p.replace(',', '')) > 0]
                        if potential_prices:
                            # Instead of max(), take the first one found after the keyword
                            price_found_by_regex = potential_prices[0]
                            logger.info(
                                f"Extracted price {price_found_by_regex} using a conservative general regex.")

            if price_found_by_regex is not None: result["recommended_price"] = price_found_by_regex; result[
                "confidence"] = 0.5; result["reasoning"] += " [Price extracted via regex fallback]"; result[
                "success"] = True; result["has_recommendation"] = True; result["error"] = None; return result
        logger.warning(f"Could not extract recommended price via JSON or regex.");
        if not result["error"]: result["error"] = "Could not parse JSON or find price via regex."
        result["success"] = False;
        result["has_recommendation"] = False;
        result["recommended_price"] = None;
        result["confidence"] = 0.0
        return result

    def _detect_seasonal_factors(self) -> str:
        month = datetime.now().month
        if month in [6, 7, 8]: return "Peak Summer Season"
        if month in [12, 1, 2]: return "Winter/Off-Peak Season"
        if month in [3, 4, 5]: return "Spring Shoulder Season"
        if month in [9, 10, 11]: return "Autumn Shoulder Season"
        return "Normal Season"

    def _generate_cache_key(self, listing_data: Dict, market_data: Dict, external_data: Dict, target_roi: float,
                                template: str) -> str:

            def safe_format(value, default_val, format_spec):
                val_to_format = value if value is not None else default_val
                return f"{val_to_format:{format_spec}}"

            key_listing = {
                "id": listing_data.get("id", "N/A"),
                "price": safe_format(listing_data.get('price'), 0.0, ".2f"),
                "bedrooms": listing_data.get("bedrooms"),
                "bathrooms": listing_data.get("bathrooms"),
                "guests": listing_data.get("guests"),
                "ratings": safe_format(listing_data.get('ratings'), 0.0, ".1f"),
                "avg_occupancy": safe_format(listing_data.get('avg_occupancy'), 0.0, ".1f"),
                "amenity_count": listing_data.get("amenity_count", 0)
            }

            key_market = {
                "avg_price": safe_format(market_data.get('market_avg_price'), 0.0, ".2f"),
                "median_price": safe_format(market_data.get('market_median_price'), 0.0, ".2f"),
                "avg_occupancy": safe_format(market_data.get('market_avg_occupancy'), 0.0, ".1f"),
                "count": market_data.get("market_similar_listings_count", 0)
            }

            key_external = {
                "events": external_data.get("upcoming_events", 0),
                "trend": external_data.get("market_trend", "Stable"),
                "vacancy": safe_format(external_data.get('vacancy_rate'), 0.0, ".1f"),
                "hpi_trend": external_data.get("fred_hpi_trend", "Unknown"),
                "mortgage_level": external_data.get("fred_mortgage_rate_level", "Unknown")
            }

            combined = {"listing": key_listing, "market": key_market, "external": key_external,
                        "target_roi": f"{target_roi:.2f}", "template": template, "model": self.model}
            return json.dumps(combined, sort_keys=True)

    def get_pricing_recommendation(self, listing_data: Dict, market_data: Dict, external_data: Dict, target_roi: float,
                                   use_cache: bool = True, template: str = None,
                                   prompt_extra_data: Optional[Dict] = None) -> Dict:
        """Generates a pricing recommendation using the LLM (synchronous)."""
        start_time = time.time()
        if not self.openrouter_api_key:
            logger.critical("OpenRouter API key not available.")
            return {"recommended_price": None, "reasoning": "OpenRouter API key not configured", "confidence": 0.0,
                    "success": False, "has_recommendation": False, "error": "OpenRouter API key missing",
                    "response_time_s": time.time() - start_time, "primary_factors": [],
                    "price_range_suggestion": {"min": None, "max": None}, "cached": False, "prompt_sent": "N/A"}

        selected_template = template or self.prompt_manager.get_best_template()
        listing_id_log = listing_data.get('id', listing_data.get('listing_id', 'N/A'))

        market_data_prefixed = {f"market_{k}": v for k, v in market_data.items()}
        all_prompt_params = {**listing_data, **market_data_prefixed, **external_data, 'target_roi': target_roi}
        all_prompt_params.setdefault('seasonal_factors', self._detect_seasonal_factors())
        all_prompt_params.setdefault('balance_strategy', "Balanced Revenue/Occupancy")
        all_prompt_params.setdefault('event_radius', 25)
        if prompt_extra_data: all_prompt_params.update(prompt_extra_data)

        prompt_content, prompt_id = self.prompt_manager.get_prompt(selected_template, **all_prompt_params)

        if prompt_id == -1:
            logger.critical(f"Failed to generate prompt for listing '{listing_id_log}'. Error: {prompt_content}")
            return {"recommended_price": None, "reasoning": "Prompt generation failed", "confidence": 0.0,
                    "success": False, "has_recommendation": False,
                    "error": f"Prompt generation failed: {prompt_content}", "response_time_s": time.time() - start_time,
                    "primary_factors": [], "price_range_suggestion": {"min": None, "max": None}, "cached": False,
                    "prompt_sent": prompt_content}

        cache_key = self._generate_cache_key(listing_data, market_data_prefixed, external_data, target_roi,
                                             selected_template)

        if use_cache and cache_key in self.response_cache:
            cached_result = self.response_cache[cache_key].copy()
            cached_result.setdefault("response_time_s", 0)
            cached_result.setdefault("success", cached_result.get("recommended_price") is not None)
            cached_result.setdefault("has_recommendation", cached_result.get("recommended_price") is not None)
            cached_result.setdefault("error", None)
            cached_result.setdefault("primary_factors", [])
            cached_result.setdefault("price_range_suggestion", {"min": None, "max": None})
            logger.info(f"Using cached LLM response for listing '{listing_id_log}'")
            self.response_quality_log.append(
                {"timestamp": datetime.now().isoformat(), "prompt_id": prompt_id, "template": selected_template,
                 "cached": True, **cached_result})
            cached_result["response_time_s"] = time.time() - start_time
            return {**cached_result, "cached": True, "prompt_sent": prompt_content}

        logger.info(
            f"Sending prompt (ID: {prompt_id}, Length: {len(prompt_content)}) to OpenRouter model '{self.model}'...")
        headers = {"Authorization": f"Bearer {self.openrouter_api_key}", "Content-Type": "application/json"}
        payload = {"model": self.model, "messages": [{"role": "user", "content": prompt_content}]}

        parsed_recommendation = {}
        try:
            response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=120)
            response.raise_for_status()
            llm_full_response_content = response.json()['choices'][0]['message']['content']
            response_time_s_val = time.time() - start_time
            logger.info(f"OpenRouter response received in {response_time_s_val:.2f}s. Parsing...")
            parsed_recommendation = self._parse_recommendation(llm_full_response_content)
            parsed_recommendation["response_time_s"] = response_time_s_val
        except requests.exceptions.HTTPError as e:
            response_time_s_val = time.time() - start_time
            logger.error(f"OpenRouter API HTTP Error: {e.response.status_code} - {e.response.text}", exc_info=False)
            try:
                error_content = e.response.json(); llm_api_error_obj = {"error_type": "OpenRouterHTTPError",
                                                                        "message": error_content.get("error", {}).get(
                                                                            "message", e.response.text),
                                                                        "status_code": e.response.status_code,
                                                                        "details": error_content}
            except json.JSONDecodeError:
                llm_api_error_obj = {"error_type": "OpenRouterHTTPError", "message": e.response.text,
                                     "status_code": e.response.status_code}
            parsed_recommendation = {"recommended_price": None,
                                     "reasoning": f"OpenRouter API Error: {llm_api_error_obj['message']}",
                                     "confidence": 0.0, "success": False, "has_recommendation": False,
                                     "error": llm_api_error_obj, "response_time_s": response_time_s_val,
                                     "primary_factors": [], "price_range_suggestion": {"min": None, "max": None}}
        except requests.exceptions.RequestException as e:
            response_time_s_val = time.time() - start_time
            logger.error(f"OpenRouter API Request Error: {e}", exc_info=True)
            llm_api_error_obj = {"error_type": "OpenRouterRequestError", "message": str(e)}
            parsed_recommendation = {"recommended_price": None, "reasoning": f"OpenRouter Request Error: {str(e)}",
                                     "confidence": 0.0, "success": False, "has_recommendation": False,
                                     "error": llm_api_error_obj, "response_time_s": response_time_s_val,
                                     "primary_factors": [], "price_range_suggestion": {"min": None, "max": None}}
        except Exception as e:
            response_time_s_val = time.time() - start_time
            logger.error(f"Unexpected error during OpenRouter call: {e}", exc_info=True)
            llm_api_error_obj = {"error_type": type(e).__name__, "message": str(e)}
            parsed_recommendation = {"recommended_price": None, "reasoning": f"Unexpected Error: {str(e)}",
                                     "confidence": 0.0, "success": False, "has_recommendation": False,
                                     "error": llm_api_error_obj, "response_time_s": response_time_s_val,
                                     "primary_factors": [], "price_range_suggestion": {"min": None, "max": None}}

        if "response_time_s" not in parsed_recommendation: parsed_recommendation[
            "response_time_s"] = time.time() - start_time

        quality_metrics = {"timestamp": datetime.now().isoformat(), "prompt_id": prompt_id,
                           "template": selected_template, "cached": False, **parsed_recommendation}
        self.response_quality_log.append(quality_metrics)
        perf_metrics = {"success": parsed_recommendation.get("success", False),
                        "has_recommendation": parsed_recommendation.get("has_recommendation", False),
                        "response_time_s": parsed_recommendation.get("response_time_s", 0.0),
                        "confidence": parsed_recommendation.get("confidence", 0.0),
                        "recommended_price": parsed_recommendation.get("recommended_price"),
                        "error": parsed_recommendation.get("error")}
        self.prompt_manager.record_performance(prompt_id, perf_metrics)

        if use_cache and parsed_recommendation.get("success", False):
            self.response_cache[cache_key] = parsed_recommendation.copy()
            logger.debug(f"Cached LLM response for key: {cache_key[:100]}...")

        return {**parsed_recommendation, "cached": False, "prompt_sent": prompt_content}

    def analyze_performance(self, last_n: Optional[int] = None) -> Dict:
        """Summarise ``response_quality_log`` overall and per template.

        Args:
            last_n: Analyse only the most recent ``last_n`` log entries; a
                falsy value analyses the whole log.

        Returns:
            A dict of JSON-serialisable metrics (request counts, cache hits,
            success/recommendation rates, average confidence and response
            time, per-template statistics). ``error_summary`` (the five most
            frequent error strings) is present only when at least one error
            was logged. When the log is empty a ``{"message": ...}`` dict is
            returned; when analysis fails, an ``{"error": ...}`` dict.
        """
        if not self.response_quality_log:
            return {"message": "No performance data available"}
        log_to_analyze = self.response_quality_log[-last_n:] if last_n else self.response_quality_log
        if not log_to_analyze:
            return {"message": f"No performance data in the specified range (last {last_n})"}
        try:
            df = pd.DataFrame(log_to_analyze)
            # Make sure every column used below exists, with a neutral default.
            column_defaults = {'success': False, 'has_recommendation': False, 'confidence': 0.0,
                               'response_time_s': 0.0, 'template': 'UnknownTemplate', 'cached': False,
                               'error': None}
            for col, default in column_defaults.items():
                if col not in df.columns:
                    df[col] = default
            df['confidence'] = pd.to_numeric(df['confidence'], errors='coerce').fillna(0.0)
            df['response_time_s'] = pd.to_numeric(df['response_time_s'], errors='coerce').fillna(0.0)
            df['cached'] = df['cached'].astype(bool)
            df['success'] = df['success'].astype(bool)
            df['has_recommendation'] = df['has_recommendation'].astype(bool)

            total_requests = len(df)
            cache_hits = df['cached'].sum()
            llm_calls_df = df[~df['cached']]
            llm_calls_attempted = len(llm_calls_df)
            results = {"period": f"Last {last_n}" if last_n else "All Time", "total_requests_analyzed": total_requests,
                       "cache_hits": int(cache_hits), "llm_calls_attempted": llm_calls_attempted,
                       "overall_parse_success_rate_percent": (df['success'].mean() * 100) if total_requests > 0 else 0,
                       "overall_recommendation_rate_percent": (
                               df['has_recommendation'].mean() * 100) if total_requests > 0 else 0,
                       "average_confidence_overall": df['confidence'].mean() if total_requests > 0 else 0,
                       "average_llm_response_time_s": llm_calls_df[
                           'response_time_s'].mean() if llm_calls_attempted > 0 else 0,
                       "total_errors_logged": int(df['error'].notna().sum()), "template_performance": {}}

            # Per-template statistics cover real LLM calls only (cache hits excluded).
            for template in llm_calls_df['template'].unique():
                template_df = llm_calls_df[llm_calls_df['template'] == template]
                template_count = len(template_df)
                if template_count == 0:
                    continue
                results["template_performance"][template] = {
                    "llm_calls": template_count,
                    "parse_success_rate_percent": (template_df['success'].mean() * 100),
                    "recommendation_rate_percent": (template_df['has_recommendation'].mean() * 100),
                    "average_confidence": template_df['confidence'].mean(),
                    "average_response_time_s": template_df['response_time_s'].mean(),
                    "error_count": int(template_df['error'].notna().sum())}

            # Both statements belong under the guard: `error_series` used to be read
            # unconditionally, so an error-free log raised NameError and the whole
            # analysis was reported as failed.
            if results['total_errors_logged'] > 0:
                error_series = df.loc[df['error'].notna(), 'error'].astype(str)
                results["error_summary"] = error_series.value_counts().head(5).to_dict()

            logger.info(
                f"Performance analysis complete. Overall success rate: {results['overall_parse_success_rate_percent']:.2f}%")
            return convert_numpy_types(results)
        except Exception as e:
            logger.error(f"Error analyzing LLM performance: {e}", exc_info=True)
            return {"error": f"Failed to analyze performance: {e}"}

    def clear_cache(self) -> None:
        self.response_cache = {}; logger.info("LLM response cache cleared.")

    def save_cache(self, file_path: str = LLM_CACHE_FILE) -> bool:
        try:
            with open(file_path, 'wb') as f:
                pickle.dump(self.response_cache, f, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(f"LLM response cache saved to {file_path} ({len(self.response_cache)} items)");
            return True
        except Exception as e:
            logger.error(f"Error saving LLM response cache to {file_path}: {e}", exc_info=True); return False

    def load_cache(self, file_path: str = LLM_CACHE_FILE, silent: bool = False) -> bool:
        try:
            if os.path.exists(file_path):
                with open(file_path, 'rb') as f:
                    loaded_cache = pickle.load(f)
                if isinstance(loaded_cache, dict): self.response_cache = loaded_cache;
                if not silent:
                    logger.info(
                        f"LLM response cache loaded from {file_path} ({len(self.response_cache)} items)"); return True
                else:
                    logger.warning(
                        f"LLM cache file {file_path} has invalid format (not a dict). Starting with empty cache."); self.response_cache = {}; return False
            else:
                if not silent: logger.info("No LLM cache file found. Starting with empty cache."); return False
        except Exception as e:
            logger.error(f"Error loading LLM response cache from {file_path}: {e}. Starting with empty cache.",
                         exc_info=True); self.response_cache = {}; return False

class PricingOptimizer:
    """Main class for optimizing prices using API data, processed data, and LLM agent."""
    def __init__(self, api_manager: ApiManager, data_processor: DataProcessor, llm_agent: LLMAgent,
                 batch_size: int = 10):
        self.api_manager = api_manager
        self.data_processor = data_processor
        self.llm_agent = llm_agent
        self.optimization_results: Dict[Union[int, str], Dict] = {}
        self.market_feature_importance_scores: Dict[str, float] = {}
        self.batch_size = max(1, batch_size)
        self.optimization_history: List[Dict] = []
        self.market_data_cache: Dict[Union[int, str], Dict] = {}
        self.external_data_cache: Dict[str, Dict] = {}
        self.state_mapping = {"01": "Alabama", "02": "Alaska", "04": "Arizona", "05": "Arkansas", "06": "California", "08": "Colorado", "09": "Connecticut", "10": "Delaware", "11": "District of Columbia", "12": "Florida", "13": "Georgia", "15": "Hawaii", "16": "Idaho", "17": "Illinois", "18": "Indiana", "19": "Iowa", "20": "Kansas", "21": "Kentucky", "22": "Louisiana", "23": "Maine", "24": "Maryland", "25": "Massachusetts", "26": "Michigan", "27": "Minnesota", "28": "Mississippi", "29": "Missouri", "30": "Montana", "31": "Nebraska", "32": "Nevada", "33": "New Hampshire", "34": "New Jersey", "35": "New Mexico", "36": "New York", "37": "North Carolina", "38": "North Dakota", "39": "Ohio", "40": "Oklahoma", "41": "Oregon", "42": "Pennsylvania", "44": "Rhode Island", "45": "South Carolina", "46": "South Dakota", "47": "Tennessee", "48": "Texas", "49": "Utah", "50": "Vermont", "51": "Virginia", "53": "Washington", "54": "West Virginia", "55": "Wisconsin", "56": "Wyoming"}
        self.state_mapping_reverse = {v.lower(): k for k, v in self.state_mapping.items()}


    def _get_state_code_from_location(self, location_str: Optional[str]) -> Optional[str]:
        if not location_str or not isinstance(location_str, str): return None
        parts = [p.strip().lower() for p in location_str.split(',')];
        potential_state = None
        if len(parts) >= 2:
            if parts[-1] in ["united states", "us", "usa"] and len(parts) >= 3:
                potential_state = parts[-2]
            elif parts[-1] in self.state_mapping_reverse:
                potential_state = parts[-1]
            elif parts[-2] in self.state_mapping_reverse:
                potential_state = parts[-2]
        elif len(parts) == 1 and parts[0] in self.state_mapping_reverse:
            potential_state = parts[0]
        if potential_state: state_code = self.state_mapping_reverse.get(potential_state);
        if state_code:
            logger.debug(
                f"Identified state '{potential_state}' code '{state_code}' from '{location_str}'."); return state_code
        else:
            logger.debug(f"Mapped potential state '{potential_state}' but no code found.")
        logger.debug(f"Could not determine state code from location: '{location_str}'");
        return None

    def find_comparable_listings(self, listing_row: pd.Series, max_comps: int = 200) -> pd.DataFrame:
        """
        Finds the best comparable listings using H3 k-rings and similarity filtering.

        Steps:
          1. Ensure the market frame has an H3 index column at resolution 8
             (added in place to ``self.data_processor.market_df`` when missing).
          2. Grow a k-ring around the listing's cell, k = 1..5, stopping as soon
             as the pool holds at least ``max_comps`` listings. The listing's
             own id is excluded from the pool.
          3. If the pool is still empty, fall back to market listings with the
             same bedroom count, or a random market sample, and return that.
          4. If at least 20 pooled listings share the bedroom count, keep only those.
          5. Rank by ``0.6 * distance_score + 0.4 * rating_score`` and return the
             top ``max_comps`` rows.

        Args:
            listing_row: The listing to find comparables for. Reads 'id', 'lat',
                'long', 'bedrooms' and, if present, the H3 column.
            max_comps: Maximum number of rows to return.

        Returns:
            DataFrame of comparable listings (with score columns added when the
            H3 path was used), or an empty DataFrame when there is no market
            data or the listing has no coordinates.
        """
        market_df = self.data_processor.market_df
        if market_df is None or market_df.empty:
            logger.warning("No market data loaded. Cannot find comparable listings.")
            return pd.DataFrame()

        listing_id = listing_row.get('id')

        h3_resolution = 8
        h3_col = f'h3_res{h3_resolution}'
        if h3_col not in market_df.columns:
            logger.info(f"Generating H3 index column '{h3_col}' for market data...")
            market_df[h3_col] = market_df.apply(
                lambda row: h3.geo_to_h3(row['lat'], row['long'], h3_resolution) if pd.notna(
                    row['lat']) and pd.notna(row['long']) else None,
                axis=1
            )

        target_h3_index = listing_row.get(h3_col)
        if pd.isna(target_h3_index):
            if pd.notna(listing_row.get('lat')) and pd.notna(listing_row.get('long')):
                target_h3_index = h3.geo_to_h3(listing_row['lat'], listing_row['long'], h3_resolution)
            else:
                logger.warning(f"Listing {listing_id} has no coordinates. Cannot find H3 comps.")
                return pd.DataFrame()

        comps_pool = pd.DataFrame()
        for k in range(1, 6):
            ring_hexagons = h3.k_ring(target_h3_index, k)
            comps_pool = market_df[market_df[h3_col].isin(ring_hexagons)].copy()
            # Drop the listing itself; skipped when the market frame has no 'id' column.
            if listing_id is not None and 'id' in comps_pool.columns:
                comps_pool = comps_pool[comps_pool['id'] != listing_id]

            if len(comps_pool) >= max_comps:
                logger.info(f"Found {len(comps_pool)} listings within k-ring of {k} for listing {listing_id}.")
                break

        listing_bedrooms = listing_row.get('bedrooms')

        if comps_pool.empty:
            logger.warning(
                f"No listings found even within a k-ring of 5 for listing {listing_id}. Using broader fallback.")
            if pd.notna(listing_bedrooms) and 'bedrooms' in market_df.columns:
                comps_pool = market_df[market_df['bedrooms'] == listing_bedrooms].copy()
            else:
                comps_pool = market_df.sample(n=min(max_comps, len(market_df))).copy()
            return comps_pool.head(max_comps)

        if pd.notna(listing_bedrooms) and 'bedrooms' in comps_pool.columns:
            same_br_comps = comps_pool[comps_pool['bedrooms'] == listing_bedrooms]
            if len(same_br_comps) >= 20:
                comps_pool = same_br_comps
                logger.info(
                    f"Refined comparable pool to {len(comps_pool)} listings with same bedroom count ({int(listing_bedrooms)}).")

        # comps_pool may be a filtered slice of an earlier frame; take an
        # independent copy before adding the score columns to it.
        comps_pool = comps_pool.copy()

        # Rating score: rating relative to the best rating in the pool. A missing
        # 'ratings' column is treated like "no positive rating" (score 0).
        if 'ratings' in comps_pool.columns:
            ratings = pd.to_numeric(comps_pool['ratings'], errors='coerce')
            max_rating = ratings.max()
        else:
            ratings = None
            max_rating = np.nan
        if ratings is not None and max_rating > 0:
            comps_pool['rating_score'] = ratings / max_rating
        else:
            comps_pool['rating_score'] = 0

        # Distance score: 1 for the closest possible comp, 0 for the farthest one.
        distances = comps_pool.apply(
            lambda row: h3.point_dist((listing_row['lat'], listing_row['long']), (row['lat'], row['long']),
                                      unit='km'),
            axis=1
        )
        max_dist = distances.max()
        if max_dist > 0:
            comps_pool['distance_score'] = 1 - (distances / max_dist)
        else:
            comps_pool['distance_score'] = 1

        comps_pool['similarity_score'] = (0.6 * comps_pool['distance_score']) + (0.4 * comps_pool['rating_score'])

        final_comps = comps_pool.sort_values(by='similarity_score', ascending=False).head(max_comps)

        logger.info(f"Selected final {len(final_comps)} comparable listings for listing {listing_id}.")
        return final_comps

    def _get_market_data_for_listing(self, listing_row: pd.Series) -> Dict:
            listing_id = listing_row.get('id')
            listing_key = str(listing_id) if pd.notna(listing_id) else f"idx_{listing_row.name}"

            if listing_key in self.market_data_cache:
                logger.debug(f"Using cached market data for listing {listing_key}")
                return self.market_data_cache[listing_key]

            market_defaults = {"avg_price": 0.0, "median_price": 0.0, "avg_rating": 0.0, "avg_occupancy": 0.0,
                               "avg_price_per_bedroom": 0.0, "avg_price_per_guest": 0.0, "similar_listings_count": 0}

            if self.data_processor.market_df is None or self.data_processor.market_df.empty:
                logger.warning(f"No market data loaded. Returning default market values for listing {listing_key}.")
                self.market_data_cache[listing_key] = market_defaults
                return market_defaults

            nearby_listings = self.find_comparable_listings(listing_row, max_comps=200)

            if nearby_listings.empty:
                logger.warning(f"No comparable listings found for {listing_key} after all fallbacks.")
                self.market_data_cache[listing_key] = market_defaults
                return market_defaults

            result = market_defaults.copy()

            def safe_stat(series, stat_func):
                numeric_series = pd.to_numeric(series, errors='coerce').dropna()
                return stat_func(numeric_series) if not numeric_series.empty else 0.0

            result["similar_listings_count"] = len(nearby_listings)
            if 'price' in nearby_listings.columns:
                result["avg_price"] = safe_stat(nearby_listings['price'], np.mean)
                result["median_price"] = safe_stat(nearby_listings['price'], np.median)
            if 'ratings' in nearby_listings.columns:
                result["avg_rating"] = safe_stat(nearby_listings['ratings'], np.mean)
            if 'price' in nearby_listings.columns and 'bedrooms' in nearby_listings.columns:
                ppb_temp = nearby_listings.apply(
                    lambda r: r['price'] / r['bedrooms'] if pd.notna(r['bedrooms']) and r['bedrooms'] > 0 and pd.notna(
                        r['price']) else np.nan, axis=1)
                result["avg_price_per_bedroom"] = safe_stat(ppb_temp, np.mean)
            if 'price' in nearby_listings.columns and 'guests' in nearby_listings.columns:
                ppg_temp = nearby_listings.apply(
                    lambda r: r['price'] / r['guests'] if pd.notna(r['guests']) and r['guests'] > 0 and pd.notna(
                        r['price']) else np.nan, axis=1)
                result["avg_price_per_guest"] = safe_stat(ppg_temp, np.mean)
            if 'avg_occupancy' in nearby_listings.columns:
                result["avg_occupancy"] = safe_stat(nearby_listings['avg_occupancy'], np.mean)

            for k, v in result.items():
                if pd.isna(v): result[k] = 0.0

            self.market_data_cache[listing_key] = result
            return result

    def _get_external_data_for_listing(self, listing_row: pd.Series) -> Dict:
        """
        Collect the external (non-listing) signals used when pricing one listing.

        Four sources are queried through ``self.api_manager``: Census vacancy
        data (only when a state code can be derived from the listing's
        location), Ticketmaster events around the listing's coordinates, the
        SPY daily series from Alpha Vantage, and the FRED series CSUSHPINSA and
        MORTGAGE30US. A source that fails or returns unusable data is logged
        and leaves its default value in the result.

        Results are cached in ``self.external_data_cache`` under a key built
        from the state code, else the coordinates rounded to two decimals, else
        "loc_unknown".

        NOTE(review): with a state-level key, the event count fetched for the
        first listing of a state is reused for every later listing in that
        state. Confirm this is intended.

        Returns:
            Dict with keys vacancy_rate, upcoming_events, market_trend_source,
            market_trend, fred_hpi_trend and fred_mortgage_rate_level.
        """
        state_code = self._get_state_code_from_location(listing_row.get("location"))
        lat = listing_row.get('lat')
        lon = listing_row.get('long')

        if state_code:
            loc_key_part = f"state_{state_code}"
        elif pd.notna(lat) and pd.notna(lon):
            loc_key_part = f"ll_{lat:.2f}_{lon:.2f}"
        else:
            loc_key_part = "loc_unknown"
        cache_key = f"external_{loc_key_part}"

        if cache_key in self.external_data_cache:
            logger.debug(f"Using cached external data for location key {loc_key_part}")
            return self.external_data_cache[cache_key]

        result = {
            "vacancy_rate": 0.0,
            "upcoming_events": 0,
            "market_trend_source": "N/A",
            "market_trend": "Stable",
            "fred_hpi_trend": "Unknown",
            "fred_mortgage_rate_level": "Unknown",
        }
        parse_errors = (ValueError, TypeError, KeyError, IndexError)

        # --- Census: state-level vacancy rate ---
        if not state_code:
            logger.debug(
                f"Could not determine state code for listing {listing_row.get('id', listing_row.name)}. Skipping Census data.")
        else:
            logger.debug(f"Fetching Census data for state code {state_code}...")
            vacancy_data = self.api_manager.get_census_vacancy_data(state=state_code)
            if not (vacancy_data.get("success") and isinstance(vacancy_data.get("data"), list)
                    and vacancy_data["data"]):
                logger.warning(
                    f"Failed to fetch or parse Census vacancy data for state {state_code}. Error: {vacancy_data.get('error')}")
            else:
                try:
                    first_record = vacancy_data["data"][0]
                    rate_text = first_record.get("VACANCY_RATE")
                    if rate_text is None or rate_text == '.':
                        logger.warning(f"Vacancy rate missing or invalid ('.') in Census data for state {state_code}.")
                    else:
                        result["vacancy_rate"] = float(rate_text)
                        logger.debug(f"Vacancy rate for state {state_code}: {result['vacancy_rate']:.1f}%")
                except parse_errors as e:
                    logger.warning(f"Could not parse vacancy rate from Census data for state {state_code}: {e}")

        # --- Ticketmaster: number of events near the listing ---
        event_radius_miles = 25
        if pd.notna(lat) and pd.notna(lon):
            logger.debug(
                f"Fetching Ticketmaster events near ({lat:.4f}, {lon:.4f}), radius {event_radius_miles} miles...")
            events_data = self.api_manager.get_ticketmaster_events(lat, lon, radius=event_radius_miles)
            if events_data.get("success") and isinstance(events_data.get("data"), dict):
                embedded = events_data["data"].get("_embedded", {})
                event_list = embedded.get("events", [])
                if isinstance(event_list, list):
                    result["upcoming_events"] = len(event_list)
                else:
                    result["upcoming_events"] = 0
                logger.debug(f"Found {result['upcoming_events']} upcoming events.")
            else:
                logger.warning(f"Failed to fetch or parse Ticketmaster events. Error: {events_data.get('error')}")
        else:
            logger.debug(
                f"Lat/Long missing for listing {listing_row.get('id', listing_row.name)}. Skipping Ticketmaster events.")

        # --- Alpha Vantage: SPY, newest 5 closes vs. closes 21-25 back ---
        trend_symbol = "SPY"
        result["market_trend_source"] = trend_symbol
        logger.debug(f"Fetching market trend indicator: {trend_symbol} from Alpha Vantage...")
        market_index_data = self.api_manager.get_alpha_vantage_stock_data(trend_symbol)
        if not (market_index_data.get("success") and isinstance(market_index_data.get("data"), dict)):
            logger.warning(
                f"Failed to fetch or parse market index data ({trend_symbol}). Error: {market_index_data.get('error')}")
        else:
            time_series = market_index_data["data"].get("Time Series (Daily)")
            if not (isinstance(time_series, dict) and len(time_series) >= 25):
                logger.warning(
                    f"Not enough data points ({len(time_series or {})}) received for {trend_symbol} trend analysis.")
            else:
                try:
                    newest_first = sorted(time_series, reverse=True)
                    recent_prices = [float(time_series[day]["4. close"]) for day in newest_first[:5]]
                    earlier_prices = [float(time_series[day]["4. close"]) for day in newest_first[20:25]]
                    if not (recent_prices and earlier_prices):
                        logger.warning(
                            f"Could not extract sufficient price points for {trend_symbol} trend calculation.")
                    else:
                        avg_recent = sum(recent_prices) / len(recent_prices)
                        avg_earlier = sum(earlier_prices) / len(earlier_prices)
                        if avg_earlier == 0:
                            logger.warning(f"Cannot calculate trend for {trend_symbol}, earlier average price is zero.")
                        else:
                            percent_change = ((avg_recent - avg_earlier) / avg_earlier) * 100
                            if percent_change > 5:
                                trend_label = "Strong Upward"
                            elif percent_change > 1.5:
                                trend_label = "Upward"
                            elif percent_change < -5:
                                trend_label = "Strong Downward"
                            elif percent_change < -1.5:
                                trend_label = "Downward"
                            else:
                                trend_label = "Stable"
                            result["market_trend"] = trend_label
                            logger.debug(
                                f"Market trend ({trend_symbol}): '{trend_label}' ({percent_change:.2f}% change over ~1 month).")
                except parse_errors as e:
                    logger.warning(f"Could not calculate trend from {trend_symbol} data: {e}")

        # --- FRED: home price index trend and 30-year mortgage rate level ---
        fred_series_ids = ["CSUSHPINSA", "MORTGAGE30US"]
        logger.debug(f"Fetching FRED data for series: {fred_series_ids}...")
        fred_data = self.api_manager.get_fred_housing_data(fred_series_ids)
        if not (fred_data.get("success") and isinstance(fred_data.get("data"), dict)):
            logger.warning(f"Failed to fetch or parse FRED housing data. Error: {fred_data.get('error')}")
        else:
            fred_results = fred_data["data"]
            hpi_data = fred_results.get("CSUSHPINSA")
            mortgage_data = fred_results.get("MORTGAGE30US")

            if not (isinstance(hpi_data, list) and len(hpi_data) >= 4):
                logger.warning(f"Not enough data points received for FRED HPI ({len(hpi_data or [])}) trend analysis.")
            else:
                try:
                    # Compares the first observation with the fourth one.
                    latest_hpi_text = hpi_data[0].get('value')
                    prev_hpi_text = hpi_data[3].get('value')
                    if latest_hpi_text == '.' or prev_hpi_text == '.':
                        logger.warning("Missing values ('.') encountered in FRED HPI data.")
                    else:
                        latest_hpi = float(latest_hpi_text)
                        prev_hpi = float(prev_hpi_text)
                        if prev_hpi == 0:
                            logger.warning("Previous FRED HPI value is zero, cannot calculate trend.")
                        else:
                            hpi_change_pct = ((latest_hpi - prev_hpi) / prev_hpi) * 100
                            if hpi_change_pct > 1.5:
                                hpi_label = "Increasing"
                            elif hpi_change_pct < -1.5:
                                hpi_label = "Decreasing"
                            else:
                                hpi_label = "Stable"
                            result["fred_hpi_trend"] = hpi_label
                            logger.debug(
                                f"FRED HPI Trend (National): {hpi_label} ({hpi_change_pct:.2f}% change over ~3 months).")
                except parse_errors as e:
                    logger.warning(f"Could not calculate FRED HPI trend: {e}")

            if not (isinstance(mortgage_data, list) and len(mortgage_data) > 0):
                logger.warning(f"No data points received for FRED Mortgage Rate ({len(mortgage_data or [])}).")
            else:
                try:
                    latest_rate_text = mortgage_data[0].get('value')
                    if latest_rate_text == '.':
                        logger.warning("Latest FRED Mortgage Rate value is missing ('.').")
                    else:
                        latest_rate = float(latest_rate_text)
                        if latest_rate > 7.0:
                            rate_label = "High"
                        elif latest_rate < 4.5:
                            rate_label = "Low"
                        else:
                            rate_label = "Moderate"
                        result["fred_mortgage_rate_level"] = rate_label
                        logger.debug(
                            f"FRED Mortgage Rate Level (30yr Fixed): {rate_label} ({latest_rate:.2f}%).")
                except parse_errors as e:
                    logger.warning(f"Could not determine FRED Mortgage Rate level: {e}")

        self.external_data_cache[cache_key] = result
        logger.debug(f"Cached external data for location key {loc_key_part}.")
        return result

    # def optimize_single_listing(self, listing_idx: int, target_roi: float, prompt_template: str = None,
    #                             use_cache: bool = True) -> Optional[Dict]:
    #     if self.data_processor.listings_df is None:
    #         logger.error("Listings data not loaded.");
    #         return None
    #     if not (0 <= listing_idx < len(self.data_processor.listings_df)):
    #         logger.error(f"Invalid listing index: {listing_idx}.");
    #         return None
    #
    #     listing_row = self.data_processor.listings_df.iloc[listing_idx]
    #     listing_id = str(listing_row.get('id', f"idx_{listing_idx}"))
    #
    #     try:
    #         listing_dict_for_llm = convert_numpy_types(listing_row.to_dict())
    #         listing_dict_for_llm['listing_id'] = listing_id
    #
    #         market_data = self._get_market_data_for_listing(listing_row)
    #         external_data = self._get_external_data_for_listing(listing_row)
    #
    #         importance_scores = self.market_feature_importance_scores
    #         present_amenity_scores = {}
    #         missing_high_value_amenities = {}
    #         POSITIVE_THRESHOLD = 5.0
    #         SIGNIFICANCE_THRESHOLD = 1.0
    #         listing_amenities_raw = listing_dict_for_llm.get('amenities_list', [])
    #         listing_amenities_set = set(
    #             str(a).lower().strip() for a in listing_amenities_raw if isinstance(a, str) and str(a).strip())
    #
    #         for amenity_name, score in importance_scores.items():
    #             is_present = amenity_name.lower().strip() in listing_amenities_set
    #             if is_present:
    #                 if abs(score) >= SIGNIFICANCE_THRESHOLD: present_amenity_scores[
    #                     amenity_name.capitalize()] = f"{score:+.2f}"
    #             elif score >= POSITIVE_THRESHOLD:
    #                 missing_high_value_amenities[amenity_name.capitalize()] = f"{score:+.2f}"
    #
    #         sorted_present = dict(sorted(present_amenity_scores.items(), key=lambda item: float(item[1]), reverse=True))
    #         sorted_missing = dict(
    #             sorted(missing_high_value_amenities.items(), key=lambda item: float(item[1]), reverse=True))
    #         present_summary = ", ".join(
    #             [f"{name} (Score: {score})" for name, score in sorted_present.items()]) or "None significant"
    #         missing_summary = ", ".join(
    #             [f"{name} (Score: {score})" for name, score in sorted_missing.items()]) or "None significant"
    #         prompt_extra_params = {'present_amenity_scores_summary': present_summary,
    #                                'missing_valuable_amenities_summary': missing_summary}
    #
    #         llm_result = self.llm_agent.get_pricing_recommendation(
    #             listing_data=listing_dict_for_llm, market_data=market_data, external_data=external_data,
    #             target_roi=target_roi, use_cache=use_cache, template=prompt_template,
    #             prompt_extra_data=prompt_extra_params
    #         )
    #
    #         prompt_sent = llm_result.get("prompt_sent", "PROMPT NOT FOUND")
    #
    #         opt_result_payload = {
    #             "listing_index": listing_idx,
    #             "listing_id": listing_id,
    #             "current_price": listing_dict_for_llm.get("price"),
    #             "target_roi": target_roi,
    #             "prompt_template_used": prompt_template or self.llm_agent.prompt_manager.get_best_template(),
    #             "llm_output": llm_result,
    #             "llm_prompt_sent": prompt_sent,
    #             "listing_data_summary": {"price": listing_dict_for_llm.get("price"),
    #                                      "bedrooms": listing_dict_for_llm.get("bedrooms"),
    #                                      "location": listing_dict_for_llm.get("location")},
    #             "market_data_used": market_data,
    #             "external_data_used": external_data,
    #             "amenity_scores_calculated": present_amenity_scores,
    #             "missing_high_value_amenities": missing_high_value_amenities,
    #             "timestamp": datetime.now().isoformat()
    #         }
    #         self.optimization_results[listing_id] = opt_result_payload
    #         self.optimization_history.append(
    #             {"listing_index": listing_idx, "listing_id": listing_id, "timestamp": opt_result_payload["timestamp"],
    #              "current_price": opt_result_payload["current_price"],
    #              "recommended_price": llm_result.get("recommended_price"), "confidence": llm_result.get("confidence"),
    #              "success": llm_result.get("recommended_price") is not None})
    #
    #         return opt_result_payload
    #
    #     except Exception as e:
    #         logger.error(f"Unexpected error optimizing listing index {listing_idx} (ID: {listing_id}): {e}",
    #                      exc_info=True)
    #         self.optimization_results[listing_id] = {"listing_index": listing_idx, "listing_id": listing_id,
    #                                                  "error": str(e), "timestamp": datetime.now().isoformat(),
    #                                                  "current_price": listing_row.get('price'),
    #                                                  "recommended_price": None, "confidence": 0.0}
    #         return self.optimization_results[listing_id]

    def optimize_single_listing(self, listing_idx: int, target_roi: float, prompt_template: Optional[str] = None,
                                use_cache: bool = True) -> Optional[Dict]:
        """
        Produce a pricing recommendation for one listing and record the outcome.

        Gathers market and external data for the listing, summarises which
        scored amenities it has or lacks, asks the LLM agent for a
        recommendation, then stores the payload in ``self.optimization_results``
        (keyed by listing id) and appends a short entry to
        ``self.optimization_history``.

        Args:
            listing_idx: Positional index of the listing in
                ``self.data_processor.listings_df``.
            target_roi: Target ROI forwarded to the LLM agent.
            prompt_template: Template forwarded to the LLM agent; when falsy the
                payload records ``prompt_manager.get_best_template()`` instead.
            use_cache: Forwarded to the LLM agent.

        Returns:
            The result payload dict. If anything raises while processing, an
            error payload ("error", "recommended_price": None, "confidence":
            0.0, empty amenity dicts) is stored and returned instead. None when
            no listings are loaded or ``listing_idx`` is out of range.
        """
        listings_df = self.data_processor.listings_df
        if listings_df is None:
            logger.error("Listings data not loaded.")
            return None
        if not (0 <= listing_idx < len(listings_df)):
            logger.error(f"Invalid listing index: {listing_idx}.")
            return None

        listing_row = listings_df.iloc[listing_idx]
        # A missing or NaN id would otherwise make every such listing share the
        # key "nan" and overwrite each other's results.
        raw_id = listing_row.get('id')
        listing_id = str(raw_id) if pd.notna(raw_id) else f"idx_{listing_idx}"

        try:
            listing_dict_for_llm = convert_numpy_types(listing_row.to_dict())
            listing_dict_for_llm['listing_id'] = listing_id

            market_data = self._get_market_data_for_listing(listing_row)
            external_data = self._get_external_data_for_listing(listing_row)

            importance_scores = self.market_feature_importance_scores
            present_amenity_scores = {}
            missing_high_value_amenities = {}
            POSITIVE_THRESHOLD = 5.0      # minimum score for a missing amenity to be reported
            SIGNIFICANCE_THRESHOLD = 1.0  # minimum |score| for a present amenity to be reported

            # The value may be None (e.g. a NaN cell) or a plain string; neither
            # is a usable collection of amenity names, so treat both as "none".
            listing_amenities_raw = listing_dict_for_llm.get('amenities_list')
            if (listing_amenities_raw is None or isinstance(listing_amenities_raw, (str, bytes))
                    or not hasattr(listing_amenities_raw, '__iter__')):
                listing_amenities_raw = []
            listing_amenities_set = {
                a.lower().strip() for a in listing_amenities_raw if isinstance(a, str) and a.strip()}

            for amenity_name, score in importance_scores.items():
                is_present = amenity_name.lower().strip() in listing_amenities_set
                if is_present:
                    if abs(score) >= SIGNIFICANCE_THRESHOLD:
                        present_amenity_scores[amenity_name.capitalize()] = f"{score:+.2f}"
                elif score >= POSITIVE_THRESHOLD:
                    missing_high_value_amenities[amenity_name.capitalize()] = f"{score:+.2f}"

            def _score_of(item):
                # Scores are stored as formatted strings such as "+1.25".
                return float(item[1])

            # Three highest-scoring plus two lowest-scoring present amenities;
            # dict() drops the overlap when fewer than five are available.
            sorted_present_desc = sorted(present_amenity_scores.items(), key=_score_of, reverse=True)
            sorted_present_asc = sorted(present_amenity_scores.items(), key=_score_of)
            top_present_amenities = dict(sorted_present_desc[:3] + sorted_present_asc[:2])

            # Five most valuable amenities the listing does not have.
            sorted_missing_desc = sorted(missing_high_value_amenities.items(), key=_score_of, reverse=True)
            top_missing_amenities = dict(sorted_missing_desc[:5])

            # Text summaries passed to the prompt.
            present_summary = ", ".join(
                [f"{name} (Score: {score})" for name, score in top_present_amenities.items()]) or "None significant"
            missing_summary = ", ".join(
                [f"{name} (Score: {score})" for name, score in top_missing_amenities.items()]) or "None significant"
            prompt_extra_params = {'present_amenity_scores_summary': present_summary,
                                   'missing_valuable_amenities_summary': missing_summary}

            llm_result = self.llm_agent.get_pricing_recommendation(
                listing_data=listing_dict_for_llm, market_data=market_data, external_data=external_data,
                target_roi=target_roi, use_cache=use_cache, template=prompt_template,
                prompt_extra_data=prompt_extra_params
            )

            prompt_sent = llm_result.get("prompt_sent", "PROMPT NOT FOUND")

            opt_result_payload = {
                "listing_index": listing_idx,
                "listing_id": listing_id,
                "current_price": listing_dict_for_llm.get("price"),
                "target_roi": target_roi,
                "prompt_template_used": prompt_template or self.llm_agent.prompt_manager.get_best_template(),
                "llm_output": llm_result,
                "llm_prompt_sent": prompt_sent,
                "listing_data_summary": {"price": listing_dict_for_llm.get("price"),
                                         "bedrooms": listing_dict_for_llm.get("bedrooms"),
                                         "location": listing_dict_for_llm.get("location")},
                "market_data_used": market_data,
                "external_data_used": external_data,
                "amenity_scores_calculated": top_present_amenities,
                "missing_valuable_amenities": top_missing_amenities,
                "timestamp": datetime.now().isoformat()
            }

            self.optimization_results[listing_id] = opt_result_payload
            self.optimization_history.append(
                {"listing_index": listing_idx, "listing_id": listing_id, "timestamp": opt_result_payload["timestamp"],
                 "current_price": opt_result_payload["current_price"],
                 "recommended_price": llm_result.get("recommended_price"), "confidence": llm_result.get("confidence"),
                 "success": llm_result.get("recommended_price") is not None})

            return opt_result_payload

        except Exception as e:
            # Best-effort boundary: one failing listing must not abort a whole run.
            logger.error(f"Unexpected error optimizing listing index {listing_idx} (ID: {listing_id}): {e}",
                         exc_info=True)
            self.optimization_results[listing_id] = {
                "listing_index": listing_idx,
                "listing_id": listing_id,
                "error": str(e),
                "timestamp": datetime.now().isoformat(),
                "current_price": listing_row.get('price'),
                "recommended_price": None,
                "confidence": 0.0,
                # Same amenity keys as the success payload, so consumers can rely on them.
                "amenity_scores_calculated": {},
                "missing_valuable_amenities": {}
            }
            return self.optimization_results[listing_id]

    def get_results_as_dataframe(self) -> pd.DataFrame:
        """
        Exports the optimization results to a pandas DataFrame.

        One row per entry of ``self.optimization_results``. The nested
        ``market_data_used`` / ``external_data_used`` dicts are flattened into
        ``market_*`` / ``external_*`` columns, selected fields of ``llm_output``
        are copied into ``llm_*`` columns (``llm_output`` itself is dropped),
        and the remaining list/dict-valued columns are serialized to JSON strings.

        Rows that lack a nested dict (error payloads) get NaN in the flattened
        columns instead of breaking the export.

        Column order: core fields, recommendation fields, prompt, LLM
        performance fields, ``market_*``, ``external_*``, the dictionary
        columns, then everything else alphabetically.

        Returns:
            The results DataFrame, or an empty DataFrame when there are no results.
        """
        if not self.optimization_results:
            logger.warning("No optimization results to export.")
            return pd.DataFrame()

        logger.info(f"Exporting {len(self.optimization_results)} optimization results to DataFrame...")
        results_df = pd.DataFrame(list(self.optimization_results.values()))

        def _normalize_dict_column(series: pd.Series) -> pd.DataFrame:
            # json_normalize fails on non-dict records, and error payloads leave
            # NaN in these columns, so substitute an empty dict for those rows.
            records = [value if isinstance(value, dict) else {} for value in series]
            flat = pd.json_normalize(records)
            flat.index = series.index
            return flat

        # Flatten nested dictionaries while keeping the original columns
        if 'market_data_used' in results_df.columns:
            market_df_flat = _normalize_dict_column(results_df['market_data_used']).add_prefix('market_')
            results_df = pd.concat([results_df, market_df_flat], axis=1)

        if 'external_data_used' in results_df.columns:
            external_df_flat = _normalize_dict_column(results_df['external_data_used']).add_prefix('external_')
            results_df = pd.concat([results_df, external_df_flat], axis=1)

        if 'llm_output' in results_df.columns:
            llm_prefix = 'llm_'
            keys_to_extract = ["recommended_price", "reasoning", "confidence", "success", "has_recommendation", "error",
                               "response_time_s", "primary_factors", "price_range_suggestion.min",
                               "price_range_suggestion.max"]
            normalized_llm_output = _normalize_dict_column(results_df['llm_output'])
            for key_path in keys_to_extract:
                target_col_name = llm_prefix + key_path.replace('.', '_')
                if key_path in normalized_llm_output.columns:
                    results_df[target_col_name] = normalized_llm_output[key_path]
                else:
                    results_df[target_col_name] = None
            results_df = results_df.drop('llm_output', axis=1, errors='ignore')

        # Define column order
        core_cols = ['listing_index', 'listing_id', 'timestamp', 'current_price', 'target_roi', 'prompt_template_used']
        reco_cols = ['llm_recommended_price', 'llm_confidence', 'llm_reasoning', 'llm_price_range_suggestion_min',
                     'llm_price_range_suggestion_max', 'llm_primary_factors']
        chain_of_thought_cols = ['llm_prompt_sent']
        perf_cols = ['llm_response_time_s', 'llm_error', 'llm_success', 'llm_has_recommendation']
        market_cols_prefix = 'market_'
        external_cols_prefix = 'external_'
        dict_cols = [
            'market_data_used',
            'external_data_used',
            'amenity_scores_calculated',
            'missing_valuable_amenities'
        ]

        # 'market_data_used' / 'external_data_used' also match the prefixes; keep
        # them out of the prefix groups so no column is selected twice.
        market_cols = sorted(col for col in results_df.columns
                             if col.startswith(market_cols_prefix) and col not in dict_cols)
        external_cols = sorted(col for col in results_df.columns
                               if col.startswith(external_cols_prefix) and col not in dict_cols)

        preferred_order = (core_cols + reco_cols + chain_of_thought_cols + perf_cols
                           + market_cols + external_cols + dict_cols)
        # dict.fromkeys removes any repeated name while preserving order.
        final_column_order = [col for col in dict.fromkeys(preferred_order) if col in results_df.columns]
        already_placed = set(final_column_order)
        final_column_order.extend(sorted(col for col in results_df.columns if col not in already_placed))

        results_df = results_df[final_column_order].copy()

        # Correctly serialize complex objects to valid JSON strings
        cols_to_serialize = [
            'llm_primary_factors',
            'market_data_used',
            'external_data_used',
            'amenity_scores_calculated',
            'missing_valuable_amenities',
            'missing_high_value_amenities',  # older key name, serialized if present
        ]

        for col in cols_to_serialize:
            if col in results_df.columns:
                if results_df[col].apply(lambda x: isinstance(x, (list, dict))).any():
                    logger.debug(f"Converting complex objects in column '{col}' to JSON string.")
                    results_df[col] = results_df[col].apply(
                        lambda x: json.dumps(convert_numpy_types(x)) if isinstance(x, (list, dict)) else x
                    )

        return results_df

# --- MAIN CALLABLE FUNCTION ---
def run_optimization_on_dataframes(
    listings_df: pd.DataFrame,
    market_df: Optional[pd.DataFrame],
    use_cache: bool = True
) -> Optional[pd.DataFrame]:
    """
    Orchestrates the entire optimization pipeline using provided DataFrames.

    Builds the agent components (API manager, data processor, prompt manager,
    LLM agent, optimizer), loads the DataFrames into the data processor,
    optionally computes market feature importance, then optimizes every
    listing one by one while periodically persisting caches and prompt
    history.

    Args:
        listings_df: Listings to optimize. A 'listid' column, if present, is
            renamed to 'id' on an internal copy (the caller's frame is not
            modified). An optional 'expected_roi' column supplies the
            per-listing target ROI; missing, NaN, or non-numeric values fall
            back to a default of 15.0.
        market_df: Optional market data. When it is missing or empty after
            loading, the feature-importance step is skipped.
        use_cache: Forwarded to ``optimizer.optimize_single_listing``.

    Returns:
        The DataFrame produced by ``optimizer.get_results_as_dataframe()``,
        or None when ``listings_df`` is None/empty or the data processor
        fails to load the DataFrames.
    """
    # Fallback ROI (percent) used when a listing carries no usable value.
    default_roi = 15.0
    # Persist caches every N listings so a long run can be resumed cheaply.
    cache_save_interval = 25

    logger.info("--- Starting Optimization Pipeline on Provided DataFrames ---")

    if listings_df is None or listings_df.empty:
        logger.critical("FATAL: The provided listings_df is empty or None. Pipeline cannot continue.")
        return None

    # Work on a copy so the caller's DataFrame is never mutated.
    listings_df_processed = listings_df.copy()
    if 'listid' in listings_df_processed.columns:
        listings_df_processed.rename(columns={'listid': 'id'}, inplace=True)
        logger.info("Renamed 'listid' column to 'id' in listings DataFrame for internal consistency.")

    if 'expected_roi' not in listings_df_processed.columns:
        logger.warning(
            f"Listings DataFrame does not contain 'expected_roi' column. "
            f"Using default ROI of {default_roi} for all listings.")

    logger.info("Initializing agent components...")
    api_keys = {
        "fred": os.environ.get("FRED_API_KEY"),
        "alpha_vantage": os.environ.get("ALPHA_VANTAGE_API_KEY"),
        "ticketmaster": os.environ.get("TICKETMASTER_API_KEY")
    }
    api_manager = ApiManager(api_keys=api_keys)
    data_processor = DataProcessor()
    prompt_manager = PromptManager()
    llm_agent = LLMAgent(prompt_manager=prompt_manager)
    optimizer = PricingOptimizer(api_manager, data_processor, llm_agent, batch_size=BATCH_SIZE)

    logger.info("Loading DataFrames into the data processor...")
    success, message = data_processor.load_dataframes(listings_df_processed, market_df)
    if not success:
        logger.critical(f"FATAL: Failed to process loaded DataFrames: {message}")
        return None

    if data_processor.market_df is not None and not data_processor.market_df.empty:
        logger.info("Calculating market feature importance (one-time setup)...")
        try:
            optimizer.market_feature_importance_scores = data_processor.calculate_market_feature_importance(
                target_variable='price', min_frequency=MIN_AMENITY_FREQUENCY)
            logger.info(
                f"Feature importance: {len(optimizer.market_feature_importance_scores) if optimizer.market_feature_importance_scores else 'No'} scores generated.")
        except Exception as e:
            # Feature importance is an enhancement, not a prerequisite:
            # log the failure and continue with no scores.
            logger.error(f"Error calculating feature importance: {e}. Proceeding without.", exc_info=True)
            optimizer.market_feature_importance_scores = {}
    else:
        logger.warning("Market data not provided or empty, skipping feature importance calculation.")

    total_listings = len(data_processor.listings_df)
    logger.info(f"Starting optimization for all {total_listings} listings...")

    for index in tqdm(range(total_listings), desc="Optimizing Listings"):
        listing_row = data_processor.listings_df.iloc[index]
        raw_roi = listing_row.get('expected_roi', default_roi)

        # Coerce the ROI defensively: a single malformed cell (e.g. "15%")
        # must not abort the whole run.
        try:
            target_roi = float(raw_roi)
        except TypeError:
            # None / pd.NA / other non-convertible objects: treat as missing.
            target_roi = float('nan')
        except ValueError:
            logger.warning(
                f"ROI value {raw_roi!r} for listing index {index} is not numeric, "
                f"using default {default_roi}%.")
            target_roi = default_roi

        if pd.isna(target_roi):
            target_roi = default_roi
            logger.debug(f"ROI for listing index {index} is NaN, using default {target_roi}%.")

        optimizer.optimize_single_listing(
            listing_idx=index,
            target_roi=target_roi,
            use_cache=use_cache
        )

        processed = index + 1
        if processed % cache_save_interval == 0 or processed == total_listings:
            logger.info(f"Saving caches after processing {processed}/{total_listings} listings...")
            api_manager.save_cache()
            llm_agent.save_cache()
            prompt_manager.export_prompt_history()

    logger.info("Generating final results DataFrame...")
    results_df = optimizer.get_results_as_dataframe()

    logger.info("Saving all caches and history files one last time...")
    api_manager.save_cache()
    llm_agent.save_cache()
    prompt_manager.save_templates()
    prompt_manager.export_prompt_history()

    # Defensive: do not let the summary log line crash if no frame came back.
    processed_count = len(results_df) if results_df is not None else 0
    logger.info(f"--- Optimization Pipeline Finished. Processed {processed_count} listings. ---")
    return results_df
