# import pandas as pd
# import ast
# import json
# import re
#
# def get_top_missing_amenities(df, test):
#     """
#     Computes the top 10 most valuable amenities based on revenue and daily rates
#     while excluding amenities already present in the test dataframe.
#
#     Parameters:
#         df (pd.DataFrame): The main dataframe containing listings.
#         test (pd.DataFrame): The test dataframe containing amenities to be excluded.
#
#     Returns:
#         JSON string with the top 10 missing amenities by revenue and price impact.
#     """
#
#     # -------------------- Normalizing Amenities Function -------------------- #
#     def normalize_amenity_name(name):
#         """Remove extra spaces, special characters, and convert to lowercase."""
#         return re.sub(r"[\s\-:]+", " ", name.strip().lower())
#
#     def extract_amenity_names(amenities_json):
#         """Extract and normalize amenities from JSON field."""
#         try:
#             data = json.loads(amenities_json)
#         except (json.JSONDecodeError, TypeError):
#             return []
#
#         all_names = []
#         for group in data:
#             items = group.get("items", [])
#             for item in items:
#                 name = item.get("name")
#                 if name:
#                     cleaned_name = normalize_amenity_name(name)
#                     all_names.append(cleaned_name)
#         return all_names
#
#     # -------------------- Apply Cleaning to Amenities Column -------------------- #
#     df["amenities_list"] = df["amenities"].apply(extract_amenity_names)
#     test["amenities_list"] = test["amenities_list"].apply(ast.literal_eval)  # Convert from string to list
#
#     # Ensure all values are lists and remove duplicates
#     df["amenities_list"] = df["amenities_list"].apply(lambda x: list(set(x)) if isinstance(x, list) else [])
#     test["amenities_list"] = test["amenities_list"].apply(lambda x: list(set(x)) if isinstance(x, list) else [])
#
#     # -------------------- Get Unique Amenities -------------------- #
#     all_amenities = set(amenity for amenities in df["amenities_list"] for amenity in amenities)
#     test_amenities = set(amenity for amenities in test["amenities_list"] for amenity in amenities)
#
#     print(f"Total Unique Amenities in df: {len(all_amenities)}")
#     print(f"Total Amenities in test: {len(test_amenities)}")
#
#     # Remove amenities present in test from the unique list
#     missing_amenities = all_amenities - test_amenities
#     print(f"Total Missing Amenities to Consider: {len(missing_amenities)}")
#
#     # -------------------- Compute Amenity Value Impact (Revenue & Price) -------------------- #
#     amenity_price_impact = []
#     amenity_revenue_impact = []
#
#     for amenity in missing_amenities:
#         # Listings with and without the amenity
#         with_amenity = df[df["amenities_list"].apply(lambda x: amenity in x)]
#         without_amenity = df[df["amenities_list"].apply(lambda x: amenity not in x)]
#
#         if not with_amenity.empty and not without_amenity.empty:
#
#
#             # Compute average price difference
#             avg_price_with = with_amenity["price"].mean()
#             avg_price_without = without_amenity["price"].mean()
#             price_increase = ((avg_price_with - avg_price_without) / avg_price_without) * 100
#
#
#             amenity_price_impact.append((amenity, price_increase))
#
#     # -------------------- Get Top 10 Most Valuable Missing Amenities -------------------- #
#
#     top_10_price_amenities = sorted(amenity_price_impact, key=lambda x: x[1], reverse=True)[:10]
#
#     # Convert to JSON Output
#     output = {
#
#         "top_10_missing_amenities_by_price": {amenity: f"{increase:.2f}%" for amenity, increase in top_10_price_amenities}
#     }
#
#     return json.dumps(output, indent=4, ensure_ascii=False)
#
#
#
# df = pd.read_csv("general.csv")
# test = pd.read_csv("processed_test.csv")
# processed_json = get_top_missing_amenities(df, test)
# print(processed_json)
#
#
# #################################################################################################################
#


import pandas as pd
import json
import re
from ast import literal_eval
import json
## imp

# def get_top_missing_amenities(df, test):
#     # Precompile regex pattern for faster normalization
#     pattern = re.compile(r"[\s\-:]+")
#
#     def normalize_amenity_name(name):
#         return pattern.sub(" ", name.strip().lower())
#
#     def extract_amenity_names(amenities_json):
#         """Optimized amenities extractor with list comprehensions"""
#         try:
#             data = json.loads(amenities_json)
#         except (json.JSONDecodeError, TypeError):
#             return []
#
#         # Using list comprehension for faster processing
#         return list(set(
#             normalize_amenity_name(item["name"])
#             for group in data
#             for item in group.get("items", [])
#             if "name" in item
#         ))
#
#     # -------------------- Data Preprocessing -------------------- #
#     # Process main dataframe's amenities
#     df["amenities_list"] = df["amenities"].apply(extract_amenity_names)
#
#     # Process test dataframe's amenities with vectorized operations
#     test["amenities_list"] = test["amenities_list"].apply(
#         lambda x: list(set(literal_eval(x))) if isinstance(x, str) else []
#     )
#
#     # -------------------- Amenity Analysis -------------------- #
#     # Explode amenities to find missing ones using vectorized operations
#     df_amenities = df["amenities_list"].explode().dropna()
#     test_amenities = test["amenities_list"].explode().dropna()
#
#     missing_amenities = set(df_amenities.unique()) - set(test_amenities.unique())
#
#     # -------------------- Price Impact Calculation -------------------- #
#     # Precompute statistics using vectorized operations
#     exploded_prices = df.explode("amenities_list")[["amenities_list", "price"]]
#     amenity_stats = exploded_prices.groupby("amenities_list")["price"].agg(["sum", "count"])
#     total_price = df["price"].sum()
#     total_count = len(df)
#
#     # Calculate impacts in a vectorized way
#     price_impacts = []
#     for amenity in missing_amenities:
#         if amenity not in amenity_stats.index:
#             continue
#
#         sum_with, count_with = amenity_stats.loc[amenity]
#         if count_with == 0 or count_with == total_count:
#             continue
#
#         mean_with = sum_with / count_with
#         mean_without = (total_price - sum_with) / (total_count - count_with)
#
#         if mean_without <= 0:
#             continue
#
#         price_pct = (mean_with - mean_without) / mean_without * 100
#         price_impacts.append((amenity, price_pct))
#
#     # -------------------- Result Formatting -------------------- #
#     top_10 = sorted(price_impacts, key=lambda x: x[1], reverse=True)[:10]
#
#     return json.dumps(
#         {"top_10_missing_amenities_by_price": {k: f"{v:.2f}%" for k, v in top_10}},
#         indent=2,
#         ensure_ascii=False
#     )


# Sample usage
# df = pd.read_csv("general.csv")
# test = pd.read_csv("processed_test.csv")
# print(get_top_missing_amenities(df, test))



######################################################################################################


### important

import pandas as pd
import json
import re
import numpy as np
from ast import literal_eval
from calendar import monthrange
from pandas import to_datetime


# def get_top_missing_amenities(df, test):
#     # -------------------- Precompiled Patterns --------------------
#     pattern = re.compile(r"[\s\-:]+")
#     normalize = lambda x: pattern.sub(" ", x.strip().lower()).lower() if isinstance(x, str) else x
#
#     # -------------------- Date Processing --------------------
#     # Convert available_dates to datetime efficiently
#     df["available_dates"] = df["available_dates"].apply(
#         lambda x: [to_datetime(d) for d in literal_eval(x)] if pd.notna(x) else []
#     )
#
#     # Calculate occupied days using vectorized operations
#     date_df = df.explode("available_dates").reset_index()
#     date_df = date_df[date_df["available_dates"].notna()]
#
#     # Calculate date components
#     date_df["year"] = date_df["available_dates"].dt.year
#     date_df["month"] = date_df["available_dates"].dt.month
#     date_df["days_in_month"] = date_df.apply(
#         lambda x: monthrange(x.year, x.month)[1], axis=1
#     )
#
#     # Group with proper aggregation
#     available_counts = date_df.groupby(["index", "year", "month"]).agg(
#         available_days=("available_dates", "size"),
#         days_in_month=("days_in_month", "first")
#     ).reset_index()
#
#     # Calculate occupied days
#     available_counts["occupied_days"] = available_counts["days_in_month"] - available_counts["available_days"]
#
#     # Sum occupied days per listing
#     occupied_days = available_counts.groupby("index")["occupied_days"].sum()
#     df["total_occupied_days"] = occupied_days.reindex(df.index, fill_value=0)
#     df["total_revenue"] = df["price"] * df["total_occupied_days"]
#
#     # -------------------- Amenity Processing --------------------
#     # Process main dataframe amenities
#     def process_amenities(s):
#         try:
#             data = json.loads(s) if pd.notna(s) else []
#             return {
#                 normalize(item["name"])
#                 for group in data
#                 for item in group.get("items", [])
#                 if "name" in item
#             }
#         except (json.JSONDecodeError, TypeError):
#             return set()
#
#     df["amenities_set"] = df["amenities"].apply(process_amenities)
#
#     # Process test dataframe amenities
#     test["amenities_set"] = test["amenities_list"].apply(
#         lambda x: {normalize(i) for i in literal_eval(x)} if pd.notna(x) else set()
#     )
#
#     # -------------------- Missing Amenities --------------------
#     all_amenities = set().union(*df["amenities_set"])
#     test_amenities = set().union(*test["amenities_set"])
#     missing_amenities = all_amenities - test_amenities
#
#     # -------------------- Impact Calculation --------------------
#     # Explode once for both metrics
#     exploded = df.explode("amenities_set")[["amenities_set", "price", "total_revenue"]]
#
#     # Calculate statistics using vectorized operations
#     stats = exploded.groupby("amenities_set").agg(
#         price_sum=("price", "sum"),
#         price_count=("price", "count"),
#         revenue_sum=("total_revenue", "sum"),
#         revenue_count=("total_revenue", "count")
#     )
#
#     # Calculate price impact
#     total_price = df["price"].sum()
#     total_count = len(df)
#     stats["price_impact"] = (
#             (stats["price_sum"] / stats["price_count"] -
#              (total_price - stats["price_sum"]) / (total_count - stats["price_count"]))
#             / ((total_price - stats["price_sum"]) / (total_count - stats["price_count"]))
#             * 100
#     )
#
#     # Calculate revenue impact
#     total_revenue = df["total_revenue"].sum()
#     stats["revenue_impact"] = (
#             (stats["revenue_sum"] / stats["revenue_count"] -
#              (total_revenue - stats["revenue_sum"]) / (total_count - stats["revenue_count"]))
#             / ((total_revenue - stats["revenue_sum"]) / (total_count - stats["revenue_count"]))
#             * 100
#     )
#
#     # -------------------- Filter and Sort Results --------------------
#     def get_top_amenities(impact_series, n=10):
#         filtered = impact_series[impact_series.index.isin(missing_amenities)]
#         return (
#             filtered.replace([np.inf, -np.inf], np.nan)
#             .dropna()
#             .sort_values(ascending=False)
#             .head(n)
#             .to_dict()
#         )
#
#     price_impacts = get_top_amenities(stats["price_impact"])
#     revenue_impacts = get_top_amenities(stats["revenue_impact"])
#
#     return json.dumps({
#         "top_10_missing_by_price": {k: f"{v:.2f}%" for k, v in price_impacts.items()},
#         "top_10_missing_by_revenue": {k: f"{v:.2f}%" for k, v in revenue_impacts.items()}
#     }, indent=2, ensure_ascii=False)


#
# df = pd.read_csv("general.csv")
# test = pd.read_csv("processed_test.csv")
# print(get_top_missing_amenities(df, test))

#####################################################################################################################

from fastapi import FastAPI, UploadFile, File, BackgroundTasks, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import re
import json
from ast import literal_eval
from calendar import monthrange
from pandas import to_datetime
from io import StringIO
import uuid

app = FastAPI()

# Module-level, in-memory job record (for demo purposes only).
# Keys: "status" (starts as "not_started"), "result" and "job_id" (both start as None).
# The GET /status endpoint serializes this dict as its response body and switches
# its Cache-Control header once "status" equals "completed".
# NOTE(review): the only code that updated this dict (run_processing) is commented
# out in this file, so as written the record never leaves its initial state.
job_result = {"status": "not_started", "result": None, "job_id": None}

# df = pd.read_csv("general.csv")
# test = pd.read_csv("processed_test.csv")

# # -------------------- Your Processing Function --------------------

# def get_top_missing_amenities(df, test):
#     # -------------------- Precompiled Patterns --------------------
#     pattern = re.compile(r"[\s\-:]+")
#     normalize = lambda x: pattern.sub(" ", x.strip().lower()).lower() if isinstance(x, str) else x

#     # -------------------- Date Processing --------------------
#     # Convert available_dates to datetime efficiently
#     df["available_dates"] = df["available_dates"].apply(
#         lambda x: [to_datetime(d) for d in literal_eval(x)] if pd.notna(x) else []
#     )

#     # Calculate occupied days using vectorized operations
#     date_df = df.explode("available_dates").reset_index()
#     date_df = date_df[date_df["available_dates"].notna()]

#     # Calculate date components
#     date_df["year"] = date_df["available_dates"].dt.year
#     date_df["month"] = date_df["available_dates"].dt.month
#     date_df["days_in_month"] = date_df.apply(
#         lambda x: monthrange(x.year, x.month)[1], axis=1
#     )

#     # Group with proper aggregation
#     available_counts = date_df.groupby(["index", "year", "month"]).agg(
#         available_days=("available_dates", "size"),
#         days_in_month=("days_in_month", "first")
#     ).reset_index()

#     # Calculate occupied days
#     available_counts["occupied_days"] = available_counts["days_in_month"] - available_counts["available_days"]

#     # Sum occupied days per listing
#     occupied_days = available_counts.groupby("index")["occupied_days"].sum()
#     df["total_occupied_days"] = occupied_days.reindex(df.index, fill_value=0)
#     df["total_revenue"] = df["price"] * df["total_occupied_days"]

#     # -------------------- Amenity Processing --------------------
#     # Process main dataframe amenities
#     def process_amenities(s):
#         try:
#             data = json.loads(s) if pd.notna(s) else []
#             return {
#                 normalize(item["name"])
#                 for group in data
#                 for item in group.get("items", [])
#                 if "name" in item
#             }
#         except (json.JSONDecodeError, TypeError):
#             return set()

#     df["amenities_set"] = df["amenities"].apply(process_amenities)

#     # Process test dataframe amenities
#     test["amenities_set"] = test["amenities_list"].apply(
#         lambda x: {normalize(i) for i in literal_eval(x)} if pd.notna(x) else set()
#     )

#     # -------------------- Missing Amenities --------------------
#     all_amenities = set().union(*df["amenities_set"])



#     test_amenities = set().union(*test["amenities_set"])
#     missing_amenities = all_amenities - test_amenities

#     # -------------------- Impact Calculation --------------------
#     # Explode once for both metrics
#     exploded = df.explode("amenities_set")[["amenities_set", "price", "total_revenue"]]

#     # Calculate statistics using vectorized operations
#     stats = exploded.groupby("amenities_set").agg(
#         price_sum=("price", "sum"),
#         price_count=("price", "count"),
#         revenue_sum=("total_revenue", "sum"),
#         revenue_count=("total_revenue", "count")
#     )

#     # Calculate price impact
#     total_price = df["price"].sum()
#     total_count = len(df)
#     stats["price_impact"] = (
#             (stats["price_sum"] / stats["price_count"] -
#              (total_price - stats["price_sum"]) / (total_count - stats["price_count"]))
#             / ((total_price - stats["price_sum"]) / (total_count - stats["price_count"]))
#             * 100
#     )

#     # Calculate revenue impact
#     total_revenue = df["total_revenue"].sum()
#     stats["revenue_impact"] = (
#             (stats["revenue_sum"] / stats["revenue_count"] -
#              (total_revenue - stats["revenue_sum"]) / (total_count - stats["revenue_count"]))
#             / ((total_revenue - stats["revenue_sum"]) / (total_count - stats["revenue_count"]))
#             * 100
#     )

#     # -------------------- Filter and Sort Results --------------------
#     def get_top_amenities(impact_series, n=10):
#         filtered = impact_series[impact_series.index.isin(missing_amenities)]
#         return (
#             filtered.replace([np.inf, -np.inf], np.nan)
#             .dropna()
#             .sort_values(ascending=False)
#             .head(n)
#             .to_dict()
#         )

#     price_impacts = get_top_amenities(stats["price_impact"])
#     revenue_impacts = get_top_amenities(stats["revenue_impact"])

#     return json.dumps({
#         "top_10_missing_by_price": {k: f"{v:.2f}%" for k, v in price_impacts.items()},
#         "top_10_missing_by_revenue": {k: f"{v:.2f}%" for k, v in revenue_impacts.items()}
#     }, indent=2, ensure_ascii=False)



# ----------------------- final top 10 amenities code ---------------------


# def get_top_amenities(df, test):
#     # -------------------- Precompiled Patterns --------------------
#     pattern = re.compile(r"[\s\-:]+")
#     normalize = lambda x: pattern.sub(" ", x.strip().lower()).lower() if isinstance(x, str) else x
#
#     # -------------------- Date Processing --------------------
#     # Convert available_dates to datetime efficiently
#     df["available_dates"] = df["available_dates"].apply(
#         lambda x: [to_datetime(d) for d in literal_eval(x)] if pd.notna(x) else []
#     )
#
#     # Calculate occupied days using vectorized operations
#     date_df = df.explode("available_dates").reset_index()
#     date_df = date_df[date_df["available_dates"].notna()]
#
#     # Calculate date components
#     date_df["year"] = date_df["available_dates"].dt.year
#     date_df["month"] = date_df["available_dates"].dt.month
#     date_df["days_in_month"] = date_df.apply(
#         lambda x: monthrange(x.year, x.month)[1], axis=1
#     )
#
#     # Group with proper aggregation
#     available_counts = date_df.groupby(["index", "year", "month"]).agg(
#         available_days=("available_dates", "size"),
#         days_in_month=("days_in_month", "first")
#     ).reset_index()
#
#     # Calculate occupied days
#     available_counts["occupied_days"] = available_counts["days_in_month"] - available_counts["available_days"]
#
#     # Sum occupied days per listing
#     occupied_days = available_counts.groupby("index")["occupied_days"].sum()
#     df["total_occupied_days"] = occupied_days.reindex(df.index, fill_value=0)
#
#     # No longer calculating total_revenue, using existing column directly
#
#     # -------------------- Amenity Processing --------------------
#     # Process main dataframe amenities
#     def process_amenities(s):
#         try:
#             data = json.loads(s) if pd.notna(s) else []
#             return {
#                 normalize(item["name"])
#                 for group in data
#                 for item in group.get("items", [])
#                 if "name" in item
#             }
#         except (json.JSONDecodeError, TypeError):
#             return set()
#
#     df["amenities_set"] = df["amenities"].apply(process_amenities)
#
#     # Process test dataframe amenities
#     test["amenities_set"] = test["amenities_list"].apply(
#         lambda x: {normalize(i) for i in literal_eval(x)} if pd.notna(x) else set()
#     )
#
#     # -------------------- Missing Amenities --------------------
#     all_amenities = set().union(*df["amenities_set"])
#     test_amenities = set().union(*test["amenities_set"])
#     missing_amenities = all_amenities - test_amenities
#
#     # -------------------- Impact Calculation --------------------
#     # Explode once for both metrics
#     exploded = df.explode("amenities_set")[["amenities_set", "price", "total_revenue"]]
#
#     # Calculate statistics using vectorized operations
#     stats = exploded.groupby("amenities_set").agg(
#         price_sum=("price", "sum"),
#         price_count=("price", "count"),
#         revenue_sum=("total_revenue", "sum"),
#         revenue_count=("total_revenue", "count")
#     )
#
#     # Calculate price impact
#     total_price = df["price"].sum()
#     total_count = len(df)
#     stats["price_impact"] = (
#             (stats["price_sum"] / stats["price_count"] -
#              (total_price - stats["price_sum"]) / (total_count - stats["price_count"]))
#             / ((total_price - stats["price_sum"]) / (total_count - stats["price_count"]))
#             * 100
#     )
#
#     # Calculate revenue impact
#     total_revenue = df["total_revenue"].sum()
#     stats["revenue_impact"] = (
#             (stats["revenue_sum"] / stats["revenue_count"] -
#              (total_revenue - stats["revenue_sum"]) / (total_count - stats["revenue_count"]))
#             / ((total_revenue - stats["revenue_sum"]) / (total_count - stats["revenue_count"]))
#             * 100
#     )
#
#     # -------------------- Filter and Sort Results --------------------
#     def get_top_amenities(impact_series, n=10):
#         filtered = impact_series[impact_series.index.isin(missing_amenities)]
#         return (
#             filtered.replace([np.inf, -np.inf], np.nan)
#             .dropna()
#             .sort_values(ascending=False)
#             .head(n)
#             .to_dict()
#         )
#
#     price_impacts = get_top_amenities(stats["price_impact"])
#     revenue_impacts = get_top_amenities(stats["revenue_impact"])
#
#     return json.dumps({
#         "top_10_missing_by_price": {k: f"{v:.2f}%" for k, v in price_impacts.items()},
#         "top_10_missing_by_revenue": {k: f"{v:.2f}%" for k, v in revenue_impacts.items()}
#     }, indent=2, ensure_ascii=False)


# -------------------- Background Task --------------------



###############################################################################################

import pandas as pd
import re
import json
import numpy as np
from ast import literal_eval


# Runs of whitespace, hyphens and colons are treated as a single separator.
_AMENITY_SEPARATORS = re.compile(r"[\s\-:]+")


def _normalize_amenity(name):
    """Lower-case *name*, trim it, and collapse separator runs to one space.

    Non-string values are returned unchanged.
    """
    if not isinstance(name, str):
        return name
    return _AMENITY_SEPARATORS.sub(" ", name.strip().lower())


def _normalize_all(names):
    """Return the set of normalized *names*; empty set if an entry is unhashable."""
    try:
        return {_normalize_amenity(name) for name in names}
    except TypeError as exc:
        print(f"Error processing test amenity: {exc}")
        return set()


def _parse_listing_amenities(raw):
    """Extract normalized amenity names from one cell of the main ``amenities`` column.

    Accepts either a JSON string or an already-decoded list shaped like
    ``[{"items": [{"name": ...}, ...]}, ...]``. Anything else (NaN, None,
    dicts, malformed JSON) yields an empty set.
    """
    # Type checks come first on purpose: calling pd.isna() on a list returns
    # an array whose truth value is ambiguous, which previously made every
    # pre-parsed list fall into the error path and come back empty.
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except json.JSONDecodeError:
            print("JSON parsing failed for amenities")
            return set()
    if not isinstance(raw, list):
        return set()

    names = set()
    try:
        for group in raw:
            if not isinstance(group, dict):
                continue
            items = group.get("items", [])
            if not isinstance(items, list):
                continue
            for item in items:
                if isinstance(item, dict) and "name" in item:
                    names.add(_normalize_amenity(item["name"]))
    except TypeError as exc:
        # An unhashable "name" value; keep the original best-effort behaviour.
        print(f"Error processing amenity: {exc}")
        return set()
    return names


def _parse_test_amenities(raw):
    """Extract normalized amenity names from one cell of the test amenities column.

    Accepts a list, a Python list literal, a JSON array, or a comma-separated
    string (tried in that order). Other values yield an empty set.
    """
    if isinstance(raw, list):
        return _normalize_all(raw)
    if not isinstance(raw, str):
        return set()

    if raw.startswith("[") and raw.endswith("]"):
        try:
            parsed = literal_eval(raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, list):
            return _normalize_all(parsed)

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, list):
        return _normalize_all(parsed)

    # Last resort: treat the text as a comma-separated list.
    return {_normalize_amenity(part) for part in raw.split(",") if part.strip()}


def _relative_impact(sum_with, count_with, total_sum, total_count):
    """Return the % difference between the mean with and without an amenity.

    Returns 0 when no listing lacks the amenity or when the "without" mean is
    zero, and NaN when no listing with the amenity has a value (such entries
    are dropped later).
    """
    count_without = total_count - count_with
    if count_without <= 0:
        return 0.0
    if count_with <= 0:
        return float("nan")

    avg_with = sum_with / count_with
    avg_without = (total_sum - sum_with) / count_without
    if avg_without == 0:
        return 0.0
    return (avg_with - avg_without) / avg_without * 100


def _impact_series(stats, sum_col, count_col, total_sum, total_count):
    """Build a per-amenity impact Series from the grouped sum/count columns."""
    impacts = [
        _relative_impact(sum_with, count_with, total_sum, total_count)
        for sum_with, count_with in zip(stats[sum_col], stats[count_col])
    ]
    # Built explicitly so an empty ``stats`` still yields a (empty) Series.
    return pd.Series(impacts, index=stats.index, dtype="float64")


def _top_missing(impacts, missing_amenities, n=10):
    """Return the *n* largest finite impacts among *missing_amenities* as a dict."""
    candidates = impacts[impacts.index.isin(missing_amenities)]
    return (
        candidates.replace([np.inf, -np.inf], np.nan)
        .dropna()
        .sort_values(ascending=False)
        .head(n)
        .to_dict()
    )


def get_top_amenities(df, test_df):
    """
    Analyze top missing amenities by price and revenue impact.

    An amenity is "missing" when it appears in at least one listing of ``df``
    but in no row of ``test_df``. Its impact is the percentage difference
    between the mean price (or revenue) of listings that have it and the mean
    of those that do not.

    Parameters:
    - df: pandas DataFrame - Main data loaded from SQL database. Must have an
      ``amenities`` column (JSON string or decoded list of groups with
      ``items``/``name``). ``price`` and ``total_revenue`` are created as 0
      if absent.
    - test_df: pandas DataFrame - Test data loaded from CSV. Amenities are read
      from ``amenities_list`` if present, otherwise from ``amenities``.

    Returns:
    - JSON string with top 10 missing amenities by price and revenue impact,
      under the keys ``top_10_missing_by_price`` and
      ``top_10_missing_by_revenue``. Both are empty if the impact calculation
      fails.

    Side effects:
    - Adds an ``amenities_set`` column to both frames (and any missing
      ``price`` / ``total_revenue`` column to ``df``) in place, and prints
      progress information.
    """
    # -------------------- Debug Info --------------------
    print(f"Main DataFrame columns: {df.columns.tolist()}")
    print(f"Test DataFrame columns: {test_df.columns.tolist()}")

    # -------------------- Amenity Processing --------------------
    print("Processing main amenities...")
    df["amenities_set"] = df["amenities"].apply(_parse_listing_amenities)

    # Handle both 'amenities_list' and 'amenities' as the test column name.
    amenities_column = 'amenities_list' if 'amenities_list' in test_df.columns else 'amenities'
    print(f"Processing test amenities from column: {amenities_column}")
    test_df["amenities_set"] = test_df[amenities_column].apply(_parse_test_amenities)

    # -------------------- Missing Amenities --------------------
    all_amenities = set().union(*df["amenities_set"])
    test_amenities = set().union(*test_df["amenities_set"])
    missing_amenities = all_amenities - test_amenities

    print(f"Total unique amenities in main data: {len(all_amenities)}")
    print(f"Total unique amenities in test data: {len(test_amenities)}")
    print(f"Total missing amenities: {len(missing_amenities)}")

    # -------------------- Impact Calculation --------------------
    # Deliberately best-effort: any failure here is reported and results in
    # empty rankings rather than an exception for the caller.
    try:
        for col in ("price", "total_revenue"):
            if col not in df.columns:
                print(f"Creating missing column: {col}")
                df[col] = 0

        # One row per (listing, amenity); listings without amenities drop out.
        print("Exploding amenities for impact calculation...")
        exploded = df.explode("amenities_set")[["amenities_set", "price", "total_revenue"]]
        exploded = exploded[exploded["amenities_set"].notna() & (exploded["amenities_set"] != "")]

        print("Calculating group statistics...")
        stats = exploded.groupby("amenities_set").agg(
            price_sum=("price", "sum"),
            price_count=("price", "count"),
            revenue_sum=("total_revenue", "sum"),
            revenue_count=("total_revenue", "count")
        )

        # Totals use the non-null count so they match the per-amenity
        # "count" aggregates, which also skip missing values.
        print("Calculating price impact...")
        stats["price_impact"] = _impact_series(
            stats, "price_sum", "price_count", df["price"].sum(), df["price"].count()
        )

        print("Calculating revenue impact...")
        stats["revenue_impact"] = _impact_series(
            stats,
            "revenue_sum",
            "revenue_count",
            df["total_revenue"].sum(),
            df["total_revenue"].count(),
        )

        # -------------------- Filter and Sort Results --------------------
        print("Finding top amenities...")
        price_impacts = _top_missing(stats["price_impact"], missing_amenities)
        revenue_impacts = _top_missing(stats["revenue_impact"], missing_amenities)

    except Exception as e:
        print(f"Error calculating impacts: {e}")
        import traceback
        traceback.print_exc()
        price_impacts = {}
        revenue_impacts = {}

    print("Generating final JSON response...")
    return json.dumps({
        "top_10_missing_by_price": {k: f"{v:.2f}%" for k, v in price_impacts.items()},
        "top_10_missing_by_revenue": {k: f"{v:.2f}%" for k, v in revenue_impacts.items()}
    }, indent=2, ensure_ascii=False)


# Example usage:
"""
# In your main script:
import pandas as pd
import mysql.connector

# Connect to MySQL and fetch main data
connection = mysql.connector.connect(
    host='127.0.0.1',
    user='root',
    password='',
    database='pricing'
)

# Get main data from MySQL
query = "SELECT * FROM rental_properties"
df = pd.read_sql_query(query, connection)
connection.close()

# Get test data from CSV
test_df = pd.read_csv('test_data.csv')

# Call the function
result = get_top_amenities(df, test_df)
print(result)
"""


##############################################################################################

# async def run_processing():
#     global job_result
#     job_id = str(uuid.uuid4())
#     job_result["job_id"] = job_id
#     job_result["status"] = "processing"
#     try:
#         # Optionally simulate a delay with: await asyncio.sleep(5)
#         result = get_top_missing_amenities(df, test)
#         job_result["result"] = result
#         job_result["status"] = "completed"
#     except Exception as e:
#         job_result["status"] = "failed"
#         job_result["error"] = str(e)

# -------------------- API Endpoints --------------------

# @app.post("/process")
# async def initiate_processing(background_tasks: BackgroundTasks):
#     """
#     Initiate processing using the pre-scraped DataFrames.
#     No payload is required; the global variables are used.
#     """
#     job_id = str(uuid.uuid4())
#     jobs[job_id] = {"status": "processing", "result": None}
#     background_tasks.add_task(process_job, job_id)
#     return {"job_id": job_id, "status": "processing"}
#
# @app.get("/status/{job_id}")
# async def get_status(job_id: str):
#     """
#     Retrieve the processing status and result for the given job_id.
#     """
#     if job_id not in jobs:
#         raise HTTPException(status_code=404, detail="Job not found")
#     return jobs[job_id]


# Trigger processing on startup (or schedule this externally as needed)
# @app.on_event("startup")
# async def startup_event():
#     await run_processing()

# GET endpoint for the frontend to fetch the processed result.
@app.get("/status")
async def get_status():
    """Return the module-level job record as JSON.

    The response carries a ``Cache-Control`` header chosen from the record's
    ``status``: a completed job may be cached publicly for about three months
    (7,884,000 seconds); any other state is sent with ``no-cache``. The record
    is also printed as JSON on each call.
    """
    if job_result.get("status") == "completed":
        cache_policy = "public, max-age=7884000"
    else:
        cache_policy = "no-cache"
    response_headers = {"Cache-Control": cache_policy}

    print(json.dumps(job_result))
    return JSONResponse(content=job_result, headers=response_headers)








