# import random
# import pandas as pd
# from geopy.distance import geodesic
# import h3
# import psutil
# import mysql.connector  # Import mysql.connector for MySQL connection
# from mysql.connector import Error
#
#
# def process_dataframe(df: pd.DataFrame, lat: float, long: float) -> pd.DataFrame:
#     # -------------------- Maps Integration & H3 Aggregation -------------------- #
#
#     # -------------------- Retrieve Nearby Listings -------------------- #
#     def find_nearest_entry(dataframe, lat, long):
#         dataframe["distance"] = dataframe.apply(
#             lambda row: geodesic((lat, long), (row["lat"], row["long"])).meters, axis=1
#         )
#         min_distance = dataframe["distance"].min()
#         nearest_entries = dataframe[dataframe["distance"] == min_distance]
#         # If multiple entries have the same distance, sort by color_rank descending and take the first
#         nearest_entry = nearest_entries.sort_values("color_rank", ascending=False).iloc[0]
#         return nearest_entry, min_distance
#
#     def get_hexagon_entries(dataframe, h3_index):
#         return dataframe[dataframe["h3_index"] == h3_index].copy()
#
#     def get_nearby_entries(dataframe, h3_index, target_color_rank, current_count, max_needed):
#         collected_entries = pd.DataFrame()
#         k = 1  # start with direct neighbors
#         max_search_k = 10  # limit to 10 rings
#
#         while len(collected_entries) + current_count < max_needed and k <= max_search_k:
#             nearby_hexagons = list(h3.k_ring(h3_index, k))
#             nearby_df = dataframe[dataframe["h3_index"].isin(nearby_hexagons)].copy()
#
#             if not nearby_df.empty:
#                 nearby_df["color_rank_diff"] = abs(nearby_df["color_rank"] - target_color_rank)
#                 nearby_df = nearby_df.sort_values(["color_rank", "color_rank_diff"], ascending=[False, True])
#                 additional_needed = max_needed - (len(collected_entries) + current_count)
#                 collected_entries = pd.concat([collected_entries, nearby_df.head(additional_needed)])
#
#             k += 1  # expand search range
#         return collected_entries.head(max_needed - current_count)
#
#     def retrieve_entries(dataframe, lat, long):
#         nearest_entry, distance = find_nearest_entry(dataframe, lat, long)
#         h3_index = nearest_entry["h3_index"]
#         color_rank = nearest_entry["color_rank"]
#
#         # Get all entries within the main hexagon
#         hexagon_entries = get_hexagon_entries(dataframe, h3_index)
#
#         # Dynamically fetch additional entries until we have the desired count (set here as 200)
#         while len(hexagon_entries) < 200:
#             required_entries = 200 - len(hexagon_entries)
#             additional_entries = get_nearby_entries(dataframe, h3_index, color_rank, len(hexagon_entries), 200)
#             if additional_entries.empty:
#                 break
#             hexagon_entries = pd.concat([hexagon_entries, additional_entries]).head(200)
#
#         # Print some information about the nearest entry and hexagon
#         hex_centroid = h3.h3_to_geo(h3_index)
#         print(f"Distance to nearest entry: {distance:.2f} meters")
#         print(f"Hexagon centroid: {hex_centroid}")
#         print(f"Final dataset size: {len(hexagon_entries)}")
#
#         return hexagon_entries
#
#     # Retrieve entries based on the given lat-long
#     final_df = retrieve_entries(df, lat, long)
#     print(final_df.shape)
#
#     # # Optionally save the final dataframe to CSV:
#     # final_df.to_csv("doesit.csv", index=False)
#     # print("general_processed.csv has been saved successfully!")
#
#     return final_df
#
#
#
# # try:
# #     connection = mysql.connector.connect(
# #         host='127.0.0.1',
# #         user='root',
# #         password='',  # Using XAMPP default: no password
# #         database='pricing'
# #     )
# #     if connection.is_connected():
# #         print("Connected to the MySQL database for rental_properties data")
# #         # Read data from the 'rental_properties' table
# #         general = pd.read_sql_query("SELECT * FROM rental_properties", connection)
# #
# #     else:
# #         raise Exception("Failed to connect to the MySQL database")
# # except Error as e:
# #     print(f"Error connecting to MySQL: {e}")
# #     raise e
# # finally:
# #     if connection.is_connected():
# #         connection.close()
# #         print("MySQL connection closed")
# #
# #
# # final_df = process_dataframe(general, lat=42.1713, long=-73.9698)
# # print(final_df.dtypes)
# # print(general.dtypes)





####################################################################################################################


# import pandas as pd
# import h3
# from geopy.distance import geodesic
# from typing import Tuple, Optional
# import random  # Only for example data if needed, not core logic
#
# # --- Configuration Constants ---
# H3_RESOLUTION = 8  # Should match the resolution used in process_airbnb_data.py
# TARGET_COMPARABLES_COUNT = 250  # Aim for this many comparables
# MAX_K_RING_SEARCH = 7  # Max k-ring distance to search for neighbors
# DEFAULT_BEDROOM_SIMILARITY_WEIGHT = 0.4  # Weight for bedroom similarity in sorting
# DEFAULT_COLOR_RANK_SIMILARITY_WEIGHT = 0.6  # Weight for color rank similarity
#
#
# def get_anchor_details(
#     df_global: pd.DataFrame, test_lat: float, test_long: float
# ) -> Tuple[Optional[str], Optional[int], float]:
#     """
#     Determines the anchor H3 index and color rank for the search.
#     Prioritizes the test property's own H3 cell if data exists there.
#     Otherwise, falls back to the geographically nearest listing.
#
#     Returns:
#         tuple: (anchor_h3_index, anchor_color_rank, distance_to_anchor_point_meters)
#                Returns (None, None, float('inf')) if df_global is empty.
#     """
#     if df_global.empty:
#         print("Warning: Global dataframe is empty. Cannot determine anchor details.")
#         return None, None, float('inf')
#
#     test_h3_index = h3.geo_to_h3(test_lat, test_long, H3_RESOLUTION)
#     listings_in_test_h3 = df_global[df_global["h3_index"] == test_h3_index]
#
#     if not listings_in_test_h3.empty:
#         # Use the test property's own H3 cell if it has listings
#         # color_rank should be consistent for all listings in the same h3_index
#         anchor_h3_index = test_h3_index
#         anchor_color_rank = listings_in_test_h3["color_rank"].iloc[0]
#         # Distance to a point within its own H3 cell is effectively 0 for anchor purposes
#         distance_to_anchor_point_meters = 0.0
#         print(f"Anchor based on test property's H3 cell: {anchor_h3_index}, Color Rank: {anchor_color_rank}")
#     else:
#         # Fallback: Test property's H3 cell is empty or not in data, find geographically nearest
#         print(
#             f"Warning: Test property's H3 cell ({test_h3_index}) is empty or not in global data. Finding nearest listing.")
#         df_global["distance_to_test_loc"] = df_global.apply(
#             lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters,
#             axis=1,
#         )
#         nearest_entry = df_global.sort_values("distance_to_test_loc").iloc[0]
#         anchor_h3_index = nearest_entry["h3_index"]
#         anchor_color_rank = nearest_entry["color_rank"]
#         distance_to_anchor_point_meters = nearest_entry["distance_to_test_loc"]
#         df_global.drop(columns=["distance_to_test_loc"], inplace=True, errors='ignore')
#         print(
#             f"Anchor based on nearest listing: H3 {anchor_h3_index}, Color Rank {anchor_color_rank}, Distance: {distance_to_anchor_point_meters:.2f}m")
#
#     return anchor_h3_index, anchor_color_rank, distance_to_anchor_point_meters
#
#
# def select_comparables_for_pricing(
#         df_global_processed: pd.DataFrame,
#         test_property_details: pd.DataFrame,  # Single-row DataFrame for the property being priced
#         target_count: int = TARGET_COMPARABLES_COUNT,
# ) -> pd.DataFrame:
#     """
#     Selects comparable listings for a given test property.
#
#     Args:
#         df_global_processed: The fully processed global DataFrame of all listings.
#                              Must contain 'id', 'lat', 'long', 'price', 'bedrooms',
#                              'h3_index', 'color_rank'.
#         test_property_details: A single-row DataFrame with details of the property to price.
#                                Must contain 'lat', 'long', and optionally 'bedrooms'.
#         target_count: The desired number of comparable listings.
#
#     Returns:
#         A DataFrame of selected comparable listings.
#     """
#     if df_global_processed.empty:
#         print("Error: Global processed DataFrame is empty. Cannot select comparables.")
#         return pd.DataFrame()
#     if test_property_details.empty:
#         print("Error: Test property details are empty. Cannot select comparables.")
#         return pd.DataFrame()
#
#     # Make a copy to avoid modifying the original global DataFrame
#     df_search_pool = df_global_processed.copy()
#
#     test_lat = test_property_details["lat"].iloc[0]
#     test_long = test_property_details["long"].iloc[0]
#     test_bedrooms = (
#         test_property_details["bedrooms"].iloc[0]
#         if "bedrooms" in test_property_details.columns and pd.notna(test_property_details["bedrooms"].iloc[0])
#         else None
#     )
#
#     anchor_h3_index, anchor_color_rank, _ = get_anchor_details(
#         df_search_pool, test_lat, test_long
#     )
#
#     if anchor_h3_index is None:
#         print("Error: Could not determine anchor H3 details. Returning empty DataFrame.")
#         return pd.DataFrame()
#
#     collected_listings_df = pd.DataFrame()
#
#     # Tier 1: Listings in the anchor H3 cell
#     print(f"\n--- Tier 1: Searching anchor H3 cell: {anchor_h3_index} ---")
#     listings_in_anchor_h3 = df_search_pool[df_search_pool["h3_index"] == anchor_h3_index].copy()
#
#     if not listings_in_anchor_h3.empty:
#         # Add distance from test property for finer sorting within the anchor cell
#         listings_in_anchor_h3["distance_to_test"] = listings_in_anchor_h3.apply(
#             lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
#         )
#         sort_cols_tier1 = ["distance_to_test"]
#         ascending_tier1 = [True]
#
#         if test_bedrooms is not None:
#             listings_in_anchor_h3["bedroom_diff"] = abs(
#                 listings_in_anchor_h3["bedrooms"] - test_bedrooms
#             )
#             sort_cols_tier1.insert(0, "bedroom_diff")  # Prioritize bedroom similarity
#             ascending_tier1.insert(0, True)
#
#         listings_in_anchor_h3 = listings_in_anchor_h3.sort_values(
#             by=sort_cols_tier1, ascending=ascending_tier1
#         )
#         collected_listings_df = pd.concat([collected_listings_df, listings_in_anchor_h3])
#         # Ensure 'id' is unique if somehow duplicates were introduced (unlikely here)
#         collected_listings_df = collected_listings_df.drop_duplicates(subset=["id"])
#
#     print(f"Collected {len(collected_listings_df)} from anchor H3 cell.")
#
#     # Tier 2: Expand to k-rings if more listings are needed
#     k = 1
#     # Keep track of H3 cells already queried to avoid redundant checks (though isin and id checks handle data)
#     processed_h3_indices = {anchor_h3_index}
#
#     while len(collected_listings_df) < target_count and k <= MAX_K_RING_SEARCH:
#         print(
#             f"\n--- Tier 2: Collected {len(collected_listings_df)} listings. Need {target_count}. Searching k-ring: {k} ---")
#
#         # Get H3 indices in the current k-ring
#         # h3.k_ring returns the center cell as well, so we need to be careful or filter
#         current_ring_indices = set(h3.k_ring(anchor_h3_index, k))
#         new_indices_to_search = list(current_ring_indices - processed_h3_indices)
#
#         if not new_indices_to_search:
#             print(f"No new H3 indices to search in k-ring {k}.")
#             k += 1
#             processed_h3_indices.update(current_ring_indices)  # Mark current ring as processed
#             continue
#
#         print(f"Searching {len(new_indices_to_search)} new H3 cells in k-ring {k}.")
#
#         potential_neighbors_df = df_search_pool[
#             df_search_pool["h3_index"].isin(new_indices_to_search)
#         ].copy()
#
#         # Exclude listings already collected
#         potential_neighbors_df = potential_neighbors_df[
#             ~potential_neighbors_df["id"].isin(collected_listings_df["id"])
#         ]
#
#         if not potential_neighbors_df.empty:
#             potential_neighbors_df["color_rank_diff"] = abs(
#                 potential_neighbors_df["color_rank"] - anchor_color_rank
#             )
#             # Add distance from test property for finer sorting
#             potential_neighbors_df["distance_to_test"] = potential_neighbors_df.apply(
#                 lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
#             )
#
#             sort_cols_tier2 = ["color_rank_diff", "distance_to_test"]
#             ascending_tier2 = [True, True]  # Smaller diff, smaller distance
#
#             if test_bedrooms is not None:
#                 potential_neighbors_df["bedroom_diff"] = abs(
#                     potential_neighbors_df["bedrooms"] - test_bedrooms
#                 )
#                 sort_cols_tier2.insert(1, "bedroom_diff")  # color_rank_diff, then bedroom_diff, then distance
#                 ascending_tier2.insert(1, True)
#
#             # Optional: Add a slight preference for higher color_rank if diffs are similar
#             # sort_cols_tier2.append("color_rank")
#             # ascending_tier2.append(False) # Higher color_rank preferred
#
#             potential_neighbors_df = potential_neighbors_df.sort_values(
#                 by=sort_cols_tier2, ascending=ascending_tier2
#             )
#
#             needed_now = target_count - len(collected_listings_df)
#             collected_listings_df = pd.concat(
#                 [collected_listings_df, potential_neighbors_df.head(needed_now)]
#             )
#             collected_listings_df = collected_listings_df.drop_duplicates(subset=["id"])
#             print(
#                 f"Added {len(potential_neighbors_df.head(needed_now))} listings from k-ring {k}. Total collected: {len(collected_listings_df)}")
#         else:
#             print(f"No new, uncollected listings found in H3 cells for k-ring {k}.")
#
#         processed_h3_indices.update(current_ring_indices)  # Mark current ring as processed
#         if len(collected_listings_df) >= target_count:
#             break
#         k += 1
#
#     final_selected_listings = collected_listings_df.head(target_count)
#     print(f"\nFinal selected dataset size for pricing: {len(final_selected_listings)}")
#     if len(final_selected_listings) < target_count and target_count > 0:  # Added target_count > 0
#         print(
#             f"Warning: Could only retrieve {len(final_selected_listings)} listings, "
#             f"less than target {target_count}."
#         )
#     elif target_count == 0:
#         print("Warning: Target count was 0, no listings selected.")
#
#     # Clean up temporary columns from the selection
#     cols_to_drop_from_final = ['distance_to_test', 'bedroom_diff', 'color_rank_diff']
#     final_selected_listings = final_selected_listings.drop(
#         columns=[col for col in cols_to_drop_from_final if col in final_selected_listings.columns],
#         errors='ignore'
#     )
#
#     return final_selected_listings

########################################################################################################################


# import pandas as pd
# import h3
# from geopy.distance import geodesic
# from typing import Tuple, Optional
#
# # import random # Only if you add an if __name__ == "__main__": block that uses it
#
# # --- Configuration Constants ---
# H3_RESOLUTION = 8  # Should match the resolution used in process_airbnb_data.py
# TARGET_COMPARABLES_COUNT = 250  # Aim for this many comparables
# MAX_K_RING_SEARCH = 7  # Max k-ring distance to search for neighbors in select_comparables_for_pricing
# MAX_INITIAL_ANCHOR_SEARCH_K_RING = 2  # How far to look for a feature-similar anchor initially in get_anchor_details
#
#
# # DEFAULT_BEDROOM_SIMILARITY_WEIGHT = 0.4 # Not directly used with current sorting, but good for reference
# # DEFAULT_COLOR_RANK_SIMILARITY_WEIGHT = 0.6 # Not directly used with current sorting
#
# def get_anchor_details(
#         df_global: pd.DataFrame,
#         test_lat: float,
#         test_long: float,
#         test_bedrooms: Optional[int]
# ) -> Tuple[Optional[str], Optional[int], float]:
#     """
#     Determines the anchor H3 index and color rank for the search.
#     Priority:
#     1. Listings in test property's own H3 cell, prioritizing bedroom similarity.
#     2. If own H3 is empty or no good bedroom match, search immediate k-rings (up to MAX_INITIAL_ANCHOR_SEARCH_K_RING)
#        for the listing with the smallest bedroom difference, then closest distance.
#     3. If still no good anchor, fall back to the absolute geographically nearest listing.
#
#     Returns:
#         tuple: (anchor_h3_index, anchor_color_rank, distance_to_anchor_point_meters)
#                Returns (None, None, float('inf')) if df_global is empty or critical columns missing.
#     """
#     if df_global.empty:
#         print("Warning: Global dataframe is empty. Cannot determine anchor details.")
#         return None, None, float('inf')
#
#     required_cols = ['h3_index', 'color_rank', 'lat', 'long', 'bedrooms', 'id']
#     if not all(col in df_global.columns for col in required_cols):
#         missing = [col for col in required_cols if col not in df_global.columns]
#         print(f"Error: df_global missing required columns for get_anchor_details: {missing}")
#         return None, None, float('inf')
#
#     test_h3_index = h3.geo_to_h3(test_lat, test_long, H3_RESOLUTION)
#
#     anchor_h3_index: Optional[str] = None
#     anchor_color_rank: Optional[int] = None
#     distance_to_anchor_point_meters: float = float('inf')
#     anchor_found_method = "None"
#
#     # Strategy 1: Check test property's own H3 cell
#     listings_in_test_h3 = df_global[df_global["h3_index"] == test_h3_index].copy()  # Use .copy()
#     if not listings_in_test_h3.empty:
#         if test_bedrooms is not None:
#             listings_in_test_h3["bedroom_diff"] = abs(listings_in_test_h3["bedrooms"] - test_bedrooms)
#             listings_in_test_h3["distance_to_test"] = listings_in_test_h3.apply(
#                 lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
#             )
#             best_in_own_h3 = listings_in_test_h3.sort_values(["bedroom_diff", "distance_to_test"]).iloc[0]
#             if best_in_own_h3["bedroom_diff"] <= 3:
#                 anchor_h3_index = best_in_own_h3["h3_index"]
#                 anchor_color_rank = int(best_in_own_h3["color_rank"]) if pd.notna(
#                     best_in_own_h3["color_rank"]) else None
#                 distance_to_anchor_point_meters = best_in_own_h3["distance_to_test"]
#                 anchor_found_method = "Own H3 (good bedroom match)"
#
#         if anchor_h3_index is None:
#             if "distance_to_test" not in listings_in_test_h3.columns:
#                 listings_in_test_h3["distance_to_test"] = listings_in_test_h3.apply(
#                     lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
#                 )
#             closest_in_own_h3 = listings_in_test_h3.sort_values("distance_to_test").iloc[0]
#             anchor_h3_index = closest_in_own_h3["h3_index"]
#             anchor_color_rank = int(closest_in_own_h3["color_rank"]) if pd.notna(
#                 closest_in_own_h3["color_rank"]) else None
#             distance_to_anchor_point_meters = closest_in_own_h3["distance_to_test"]
#             anchor_found_method = "Own H3 (closest)"
#
#     # Strategy 2: If own H3 was empty or didn't yield a satisfactory anchor, and test_bedrooms is known
#     if anchor_h3_index is None and test_bedrooms is not None:
#         print(
#             f"Test property's H3 cell ({test_h3_index}) empty or no initial anchor. Searching k-rings for bedroom match.")
#         found_in_k_ring = False
#         for k_anchor_search in range(1, MAX_INITIAL_ANCHOR_SEARCH_K_RING + 1):
#             k_ring_indices = list(h3.k_ring(test_h3_index, k_anchor_search))
#             # Ensure we don't re-query the test_h3_index if it was already considered empty or unsuitable
#             k_ring_indices = [idx for idx in k_ring_indices if idx != test_h3_index]
#
#             listings_in_k_ring = df_global[df_global["h3_index"].isin(k_ring_indices)].copy()  # Use .copy()
#
#             if not listings_in_k_ring.empty:
#                 listings_in_k_ring["bedroom_diff"] = abs(listings_in_k_ring["bedrooms"] - test_bedrooms)
#                 listings_in_k_ring["distance_to_test"] = listings_in_k_ring.apply(
#                     lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
#                 )
#                 best_k_ring_match = listings_in_k_ring.sort_values(["bedroom_diff", "distance_to_test"]).iloc[0]
#
#                 anchor_h3_index = best_k_ring_match["h3_index"]
#                 anchor_color_rank = int(best_k_ring_match["color_rank"]) if pd.notna(
#                     best_k_ring_match["color_rank"]) else None
#                 distance_to_anchor_point_meters = best_k_ring_match["distance_to_test"]
#                 anchor_found_method = f"K-ring {k_anchor_search} (bedroom match)"
#                 found_in_k_ring = True
#                 break
#
#         if not found_in_k_ring:
#             print(f"No feature-similar anchor found within {MAX_INITIAL_ANCHOR_SEARCH_K_RING} k-rings.")
#
#     # Strategy 3: Fallback to absolute geographically nearest if no anchor found yet
#     if anchor_h3_index is None:
#         print(f"No anchor from own H3 or k-ring bedroom search. Falling back to absolute nearest listing.")
#         # Create a temporary column for distance calculation if it doesn't exist or was on a copy
#         df_global_temp_dist = df_global.copy()  # Work on a copy to add distance column safely
#         df_global_temp_dist["distance_to_test_loc_fallback"] = df_global_temp_dist.apply(
#             lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
#         )
#
#         if df_global_temp_dist.empty or "distance_to_test_loc_fallback" not in df_global_temp_dist.columns or \
#                 df_global_temp_dist["distance_to_test_loc_fallback"].empty:
#             print(
#                 "Error: Could not calculate distances or no entries after distance calculation in fallback for anchor.")
#             return None, None, float('inf')
#
#         nearest_entry = df_global_temp_dist.sort_values("distance_to_test_loc_fallback").iloc[0]
#         anchor_h3_index = nearest_entry["h3_index"]
#         anchor_color_rank = int(nearest_entry["color_rank"]) if pd.notna(nearest_entry["color_rank"]) else None
#         distance_to_anchor_point_meters = nearest_entry["distance_to_test_loc_fallback"]
#         anchor_found_method = "Absolute Nearest"
#
#     if anchor_h3_index is None or anchor_color_rank is None:  # Should be caught by earlier checks if df_global was empty
#         print("CRITICAL ERROR: Could not determine a valid anchor H3 index or color rank.")
#         return None, None, float('inf')
#
#     print(
#         f"Final Anchor Selected ({anchor_found_method}): H3 {anchor_h3_index}, Color Rank {anchor_color_rank}, Distance: {distance_to_anchor_point_meters:.2f}m")
#     return anchor_h3_index, anchor_color_rank, distance_to_anchor_point_meters
#
#
# def select_comparables_for_pricing(
#         df_global_processed: pd.DataFrame,
#         test_property_details: pd.DataFrame,
#         target_count: int = TARGET_COMPARABLES_COUNT,
# ) -> pd.DataFrame:
#     """
#     Selects comparable listings for a given test property.
#     """
#     if df_global_processed.empty:
#         print("Error: Global processed DataFrame is empty. Cannot select comparables.")
#         return pd.DataFrame()
#     if test_property_details.empty:
#         print("Error: Test property details are empty. Cannot select comparables.")
#         return pd.DataFrame()
#
#     # Ensure test_property_details has required columns
#     required_test_cols = ['lat', 'long', 'id']  # 'bedrooms' is optional but checked for
#     if not all(col in test_property_details.columns for col in required_test_cols):
#         missing_test_cols = [col for col in required_test_cols if col not in test_property_details.columns]
#         print(f"Error: test_property_details missing required columns: {missing_test_cols}")
#         return pd.DataFrame()
#
#     # Make a copy to avoid modifying the original global DataFrame
#     df_search_pool = df_global_processed.copy()
#
#     test_lat = test_property_details["lat"].iloc[0]
#     test_long = test_property_details["long"].iloc[0]
#     test_bedrooms = (
#         int(test_property_details["bedrooms"].iloc[0])
#         if "bedrooms" in test_property_details.columns and pd.notna(test_property_details["bedrooms"].iloc[0])
#         else None
#     )
#     test_property_id = test_property_details["id"].iloc[0]
#
#     anchor_h3_index, anchor_color_rank, _ = get_anchor_details(
#         df_search_pool, test_lat, test_long, test_bedrooms
#     )
#
#     if anchor_h3_index is None:
#         print("Error: Could not determine anchor H3 details for comparable selection. Returning empty DataFrame.")
#         return pd.DataFrame()
#
#     collected_listings_df = pd.DataFrame()
#
#     # Tier 1: Listings in the anchor H3 cell
#     print(f"\n--- Tier 1: Searching anchor H3 cell: {anchor_h3_index} ---")
#     listings_in_anchor_h3 = df_search_pool[df_search_pool["h3_index"] == anchor_h3_index].copy()
#
#     # Exclude the test property itself if it happens to be in the anchor H3 (e.g. reprocessing)
#     listings_in_anchor_h3 = listings_in_anchor_h3[listings_in_anchor_h3['id'] != test_property_id]
#
#     if not listings_in_anchor_h3.empty:
#         listings_in_anchor_h3["distance_to_test"] = listings_in_anchor_h3.apply(
#             lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
#         )
#         sort_cols_tier1 = ["distance_to_test"]
#         ascending_tier1 = [True]
#
#         if test_bedrooms is not None and 'bedrooms' in listings_in_anchor_h3.columns:
#             listings_in_anchor_h3["bedroom_diff"] = abs(
#                 listings_in_anchor_h3["bedrooms"] - test_bedrooms
#             )
#             sort_cols_tier1.insert(0, "bedroom_diff")
#             ascending_tier1.insert(0, True)
#
#         listings_in_anchor_h3 = listings_in_anchor_h3.sort_values(
#             by=sort_cols_tier1, ascending=ascending_tier1
#         )
#         collected_listings_df = pd.concat([collected_listings_df, listings_in_anchor_h3])
#         collected_listings_df = collected_listings_df.drop_duplicates(subset=["id"])
#
#     print(f"Collected {len(collected_listings_df)} from anchor H3 cell (excluding test property if present).")
#
#     # Tier 2: Expand to k-rings if more listings are needed
#     k = 1
#     processed_h3_indices = {anchor_h3_index}
#
#     while len(collected_listings_df) < target_count and k <= MAX_K_RING_SEARCH:
#         print(
#             f"\n--- Tier 2: Collected {len(collected_listings_df)} listings. Need {target_count}. Searching k-ring: {k} ---")
#
#         current_ring_indices = set(h3.k_ring(anchor_h3_index, k))
#         new_indices_to_search = list(current_ring_indices - processed_h3_indices)
#
#         if not new_indices_to_search:
#             print(f"No new H3 indices to search in k-ring {k}.")
#             k += 1
#             processed_h3_indices.update(current_ring_indices)
#             continue
#
#         print(f"Searching {len(new_indices_to_search)} new H3 cells in k-ring {k}.")
#
#         potential_neighbors_df = df_search_pool[
#             df_search_pool["h3_index"].isin(new_indices_to_search)
#         ].copy()
#
#         # Exclude the test property itself and already collected listings
#         potential_neighbors_df = potential_neighbors_df[
#             ~potential_neighbors_df["id"].isin(collected_listings_df["id"]) &
#             (potential_neighbors_df["id"] != test_property_id)
#             ]
#
#         if not potential_neighbors_df.empty:
#             potential_neighbors_df["color_rank_diff"] = abs(
#                 potential_neighbors_df["color_rank"] - anchor_color_rank
#             )
#             potential_neighbors_df["distance_to_test"] = potential_neighbors_df.apply(
#                 lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
#             )
#
#             sort_cols_tier2 = ["color_rank_diff", "distance_to_test"]
#             ascending_tier2 = [True, True]
#
#             if test_bedrooms is not None and 'bedrooms' in potential_neighbors_df.columns:
#                 potential_neighbors_df["bedroom_diff"] = abs(
#                     potential_neighbors_df["bedrooms"] - test_bedrooms
#                 )
#                 sort_cols_tier2.insert(1, "bedroom_diff")
#                 ascending_tier2.insert(1, True)
#
#             potential_neighbors_df = potential_neighbors_df.sort_values(
#                 by=sort_cols_tier2, ascending=ascending_tier2
#             )
#
#             needed_now = target_count - len(collected_listings_df)
#             collected_listings_df = pd.concat(
#                 [collected_listings_df, potential_neighbors_df.head(needed_now)]
#             )
#             collected_listings_df = collected_listings_df.drop_duplicates(subset=["id"])
#             print(
#                 f"Added {len(potential_neighbors_df.head(needed_now))} listings from k-ring {k}. Total collected: {len(collected_listings_df)}")
#         else:
#             print(f"No new, uncollected listings found in H3 cells for k-ring {k}.")
#
#         processed_h3_indices.update(current_ring_indices)
#         if len(collected_listings_df) >= target_count:
#             break
#         k += 1
#
#     final_selected_listings = collected_listings_df.head(target_count)
#     print(f"\nFinal selected dataset size for pricing: {len(final_selected_listings)}")
#     if len(final_selected_listings) < target_count and target_count > 0:
#         print(
#             f"Warning: Could only retrieve {len(final_selected_listings)} listings, "
#             f"less than target {target_count}."
#         )
#     elif target_count == 0:
#         print("Warning: Target count was 0, no listings selected.")
#
#     cols_to_drop_from_final = ['distance_to_test', 'bedroom_diff', 'color_rank_diff', 'distance_to_test_loc_fallback']
#     final_selected_listings = final_selected_listings.drop(
#         columns=[col for col in cols_to_drop_from_final if col in final_selected_listings.columns],
#         errors='ignore'
#     )
#
#     return final_selected_listings

####################################################################################################


import pandas as pd
import h3
from geopy.distance import geodesic
from typing import Tuple, Optional, List, Dict

# --- Configuration Constants ---
# H3 resolution passed to h3.geo_to_h3 when indexing the test property's location.
# NOTE(review): an earlier revision notes this should match the resolution used in
# process_airbnb_data.py — confirm the two stay in sync.
H3_RESOLUTION = 8
# Desired number of comparable listings.
# NOTE(review): the consumer is not visible in this section; presumably the default
# target_count of select_comparables_for_pricing — verify.
TARGET_COMPARABLES_COUNT = 250
# Maximum k-ring distance to expand when collecting neighbouring listings
# (presumably used by select_comparables_for_pricing — verify).
MAX_K_RING_SEARCH = 7
# Maximum k-ring distance get_anchor_details searches for a bedroom-similar anchor
# before falling back to the geographically nearest listing.
MAX_INITIAL_ANCHOR_SEARCH_K_RING = 2


def get_anchor_details(
        df_global: pd.DataFrame,
        test_lat: float,
        test_long: float,
        test_bedrooms: Optional[int]
) -> Tuple[Optional[str], Optional[int], float]:
    """
    Determine the anchor H3 index and color rank used to seed the comparable search.

    The anchor is the H3 cell / color rank of a single listing, chosen by the
    first strategy that yields one:

    1. The test property's own H3 cell is non-empty:
       a. if ``test_bedrooms`` is known and the best listing (smallest bedroom
          difference, then smallest distance) differs by at most one bedroom,
          use that listing;
       b. otherwise use the geographically closest listing in the cell.
    2. The own cell is empty and ``test_bedrooms`` is known: search k-rings
       1..MAX_INITIAL_ANCHOR_SEARCH_K_RING around the test cell and use the
       listing with the smallest bedroom difference, then smallest distance,
       from the first ring that has any listing with a numeric bedroom count.
    3. Otherwise fall back to the geographically nearest listing that has an
       ``h3_index``.

    Args:
        df_global: Listings; must contain the columns ``h3_index``,
            ``color_rank``, ``lat``, ``long``, ``bedrooms`` and ``id``.
            The frame is not modified.
        test_lat: Latitude of the test property.
        test_long: Longitude of the test property.
        test_bedrooms: Bedroom count of the test property, or None if unknown.

    Returns:
        tuple: (anchor_h3_index, anchor_color_rank, distance_to_anchor_point_meters).
        Returns (None, None, float('inf')) when ``df_global`` is empty, a
        required column is missing, no listing has an ``h3_index``, or the
        selected listing's ``color_rank`` is missing.
    """
    no_anchor = (None, None, float('inf'))

    if df_global.empty:
        print("Warning: Global dataframe is empty. Cannot determine anchor details.")
        return no_anchor

    required_cols = ['h3_index', 'color_rank', 'lat', 'long', 'bedrooms', 'id']
    missing = [col for col in required_cols if col not in df_global.columns]
    if missing:
        print(f"Error: df_global missing required columns for get_anchor_details: {missing}")
        return no_anchor

    test_h3_index = h3.geo_to_h3(test_lat, test_long, H3_RESOLUTION)
    test_point = (test_lat, test_long)

    def distances_from_test(listings: pd.DataFrame) -> pd.Series:
        # Geodesic distance in meters from the test property to every row.
        # Callers only pass non-empty frames, so apply() returns a Series.
        return listings.apply(
            lambda row: geodesic(test_point, (row["lat"], row["long"])).meters, axis=1
        )

    def best_bedroom_match(listings: pd.DataFrame) -> Optional[pd.Series]:
        # Row with the smallest bedroom difference (ties broken by distance),
        # or None when no row has a numeric bedroom count.
        candidates = listings.copy()
        candidates["bedrooms"] = pd.to_numeric(candidates["bedrooms"], errors="coerce")
        candidates = candidates.dropna(subset=["bedrooms"]).copy()
        if candidates.empty:
            return None
        candidates["bedroom_diff"] = (candidates["bedrooms"] - test_bedrooms).abs()
        candidates["distance_to_test"] = distances_from_test(candidates)
        return candidates.sort_values(["bedroom_diff", "distance_to_test"]).iloc[0]

    anchor_row: Optional[pd.Series] = None
    anchor_found_method = "None"

    # Strategy 1: the test property's own H3 cell.
    own_cell = df_global[df_global["h3_index"] == test_h3_index]
    if not own_cell.empty:
        if test_bedrooms is not None:
            candidate = best_bedroom_match(own_cell)
            # A "good" match differs by at most one bedroom.
            if candidate is not None and candidate["bedroom_diff"] <= 1:
                anchor_row = candidate
                anchor_found_method = "Own H3 (good bedroom match)"

        if anchor_row is None:
            # No good bedroom match (or bedrooms unknown): closest listing in the cell.
            # The distance is always recomputed so a stale "distance_to_test"
            # column in the input can never influence the choice.
            own_cell = own_cell.copy()
            own_cell["distance_to_test"] = distances_from_test(own_cell)
            anchor_row = own_cell.sort_values("distance_to_test").iloc[0]
            anchor_found_method = "Own H3 (closest)"

    # Strategy 2: own cell was empty -> look for a bedroom match in nearby rings.
    if anchor_row is None and test_bedrooms is not None:
        print(
            f"Test property's H3 cell ({test_h3_index}) empty or no initial anchor. Searching k-rings for bedroom match.")
        for k_anchor_search in range(1, MAX_INITIAL_ANCHOR_SEARCH_K_RING + 1):
            ring_cells = [idx for idx in h3.k_ring(test_h3_index, k_anchor_search) if idx != test_h3_index]
            candidate = best_bedroom_match(df_global[df_global["h3_index"].isin(ring_cells)])
            if candidate is not None:
                anchor_row = candidate
                anchor_found_method = f"K-ring {k_anchor_search} (bedroom match)"
                break
        if anchor_row is None:
            print(f"No feature-similar anchor found within {MAX_INITIAL_ANCHOR_SEARCH_K_RING} k-rings.")

    # Strategy 3: absolute nearest listing, regardless of features.
    if anchor_row is None:
        print("No anchor from own H3 or k-ring bedroom search. Falling back to absolute nearest listing.")
        # A listing without an H3 index cannot anchor an H3 search, so skip those rows.
        locatable = df_global[df_global["h3_index"].notna()].copy()
        if locatable.empty:
            print(
                "Error: Could not calculate distances or no entries after distance calculation in fallback for anchor.")
            return no_anchor
        locatable["distance_to_test"] = distances_from_test(locatable)
        anchor_row = locatable.sort_values("distance_to_test").iloc[0]
        anchor_found_method = "Absolute Nearest"

    anchor_h3_index = anchor_row["h3_index"]
    anchor_color_rank = int(anchor_row["color_rank"]) if pd.notna(anchor_row["color_rank"]) else None

    if pd.isna(anchor_h3_index) or anchor_color_rank is None:
        print("CRITICAL ERROR: Could not determine a valid anchor H3 index or color rank.")
        return no_anchor

    distance_to_anchor_point_meters = float(anchor_row["distance_to_test"])
    print(
        f"Final Anchor Selected ({anchor_found_method}): H3 {anchor_h3_index}, Color Rank {anchor_color_rank}, Distance: {distance_to_anchor_point_meters:.2f}m")
    return anchor_h3_index, anchor_color_rank, distance_to_anchor_point_meters


def select_comparables_for_pricing(
        df_global_processed: pd.DataFrame,
        test_property_details: pd.DataFrame,
        target_count: int = TARGET_COMPARABLES_COUNT,
) -> pd.DataFrame:
    """
    Select comparable listings for a given test property.

    The search is seeded by the anchor cell from ``get_anchor_details`` and
    proceeds in two tiers:

    * Tier 1 - every listing in the anchor H3 cell, ordered by bedroom
      difference (when the test bedroom count is known) and then by distance
      to the test property.
    * Tier 2 - k-rings 1..MAX_K_RING_SEARCH around the anchor cell, searched
      until ``target_count`` listings are collected. Within a ring, cells are
      visited by their highest ``color_rank`` (descending), then by number of
      listings (descending), then by H3 index so the order is reproducible.
      Within a cell, listings are ordered by bedroom difference, ``color_rank``
      (descending) and distance.

    The test property itself (matched on ``id``) is never returned.

    Args:
        df_global_processed: Candidate listings. Not modified; ``bedrooms``
            and ``color_rank`` are coerced to numeric on an internal copy.
        test_property_details: Frame whose first row describes the test
            property; requires ``lat``, ``long`` and ``id``; ``bedrooms`` is
            optional and ignored when missing or non-numeric.
        target_count: Maximum number of comparables to return. Values below
            zero are treated as zero.

    Returns:
        Up to ``target_count`` listings in priority order, keeping their index
        labels from ``df_global_processed`` and without the helper columns
        used for ranking. An empty DataFrame is returned when inputs are
        invalid or no anchor can be determined.
    """
    if df_global_processed.empty:
        print("Error: Global processed DataFrame is empty. Cannot select comparables.")
        return pd.DataFrame()
    if test_property_details.empty:
        print("Error: Test property details are empty. Cannot select comparables.")
        return pd.DataFrame()

    required_test_cols = ['lat', 'long', 'id']
    missing_test_cols = [col for col in required_test_cols if col not in test_property_details.columns]
    if missing_test_cols:
        print(f"Error: test_property_details missing required columns: {missing_test_cols}")
        return pd.DataFrame()

    # A negative target would make head() return "all but the last n" rows.
    target_count = max(target_count, 0)

    df_search_pool = df_global_processed.copy()
    # Ensure 'bedrooms' and 'color_rank' are numeric, coercing errors to NaN.
    for numeric_col in ('bedrooms', 'color_rank'):
        if numeric_col in df_search_pool.columns:
            df_search_pool[numeric_col] = pd.to_numeric(df_search_pool[numeric_col], errors='coerce')

    test_lat = test_property_details["lat"].iloc[0]
    test_long = test_property_details["long"].iloc[0]
    test_property_id = test_property_details["id"].iloc[0]

    # Coerce like the pool so a non-numeric bedroom value means "unknown"
    # instead of raising from int().
    test_bedrooms: Optional[int] = None
    if "bedrooms" in test_property_details.columns:
        raw_bedrooms = pd.to_numeric(test_property_details["bedrooms"], errors='coerce').iloc[0]
        if pd.notna(raw_bedrooms):
            test_bedrooms = int(raw_bedrooms)

    anchor_h3_index, anchor_color_rank, _ = get_anchor_details(
        df_search_pool, test_lat, test_long, test_bedrooms
    )

    # get_anchor_details reports failure (including a missing color_rank on
    # the anchor listing) by returning None values.
    if anchor_h3_index is None or anchor_color_rank is None:
        print("Error: Could not determine valid anchor H3 details for comparable selection. Returning empty DataFrame.")
        return pd.DataFrame()

    # A successful anchor lookup guarantees the pool has the required columns
    # (h3_index, color_rank, lat, long, bedrooms, id).
    candidate_pool = df_search_pool[df_search_pool["id"] != test_property_id]

    def rank_candidates(cell_listings: pd.DataFrame, prefer_color_rank: bool) -> pd.DataFrame:
        # Order one cell's listings by similarity to the test property and
        # drop repeated ids, keeping the best-ranked occurrence.
        ranked = cell_listings.copy()
        ranked["distance_to_test"] = ranked.apply(
            lambda row: geodesic((test_lat, test_long), (row["lat"], row["long"])).meters, axis=1
        )
        sort_cols = []
        ascending = []
        # Bedroom difference only participates when at least one listing has
        # a usable bedroom count; rows without one sort last.
        if test_bedrooms is not None and ranked["bedrooms"].notna().any():
            ranked["bedroom_diff"] = (ranked["bedrooms"] - test_bedrooms).abs()
            sort_cols.append("bedroom_diff")
            ascending.append(True)
        if prefer_color_rank:
            sort_cols.append("color_rank")
            ascending.append(False)
        sort_cols.append("distance_to_test")
        ascending.append(True)
        ranked = ranked.sort_values(by=sort_cols, ascending=ascending, na_position='last')
        return ranked.drop_duplicates(subset=["id"])

    # Pieces are gathered in a list and concatenated once at the end; ids are
    # tracked in a set so the "already collected" filter works even when
    # nothing has been collected yet.
    collected_parts = []
    collected_ids = set()
    collected_count = 0

    # Tier 1: Listings in the anchor H3 cell
    print(f"\n--- Tier 1: Searching anchor H3 cell: {anchor_h3_index} ---")
    anchor_listings = candidate_pool[candidate_pool["h3_index"] == anchor_h3_index]
    if not anchor_listings.empty:
        ranked_anchor = rank_candidates(anchor_listings, prefer_color_rank=False)
        collected_parts.append(ranked_anchor)
        collected_ids.update(ranked_anchor["id"])
        collected_count += len(ranked_anchor)

    print(f"Collected {collected_count} from anchor H3 cell (excluding test property if present).")

    # Tier 2: Expand to k-rings if more listings are needed
    processed_h3_indices = {anchor_h3_index}
    for k in range(1, MAX_K_RING_SEARCH + 1):
        if collected_count >= target_count:
            break
        print(
            f"\n--- Tier 2: Collected {collected_count} listings. Need {target_count}. Searching k-ring: {k} ---")

        current_ring_indices = set(h3.k_ring(anchor_h3_index, k))
        # Sorted so that ties between equally ranked cells do not depend on
        # set iteration order.
        new_indices_to_search = sorted(current_ring_indices - processed_h3_indices)
        processed_h3_indices.update(current_ring_indices)

        if not new_indices_to_search:
            print(f"No new H3 indices to search in k-ring {k}.")
            continue

        print(f"Evaluating {len(new_indices_to_search)} new H3 cells in k-ring {k} for color rank quality.")

        # One pass over the pool per ring, then split per cell.
        ring_listings = candidate_pool[
            candidate_pool["h3_index"].isin(new_indices_to_search)
            & ~candidate_pool["id"].isin(collected_ids)
        ]
        listings_by_cell = {
            cell: frame for cell, frame in ring_listings.groupby("h3_index", sort=False)
        }

        # Score each cell by its best color_rank; the listing count breaks
        # ties. Cells that are empty or have no valid rank get the lowest priority.
        h3_cell_quality = []
        for h3_idx in new_indices_to_search:
            max_rank, num_listings = -1, 0
            cell_listings = listings_by_cell.get(h3_idx)
            if cell_listings is not None:
                valid_ranks = cell_listings["color_rank"].dropna()
                if not valid_ranks.empty:
                    max_rank, num_listings = valid_ranks.max(), len(cell_listings)
            h3_cell_quality.append((h3_idx, max_rank, num_listings))
        # sorted() is stable, so equal-quality cells stay in H3-index order.
        h3_cell_quality.sort(key=lambda entry: (entry[1], entry[2]), reverse=True)

        num_added_this_k_ring = 0
        for h3_idx, _, _ in h3_cell_quality:
            if collected_count >= target_count:
                break
            cell_listings = listings_by_cell.get(h3_idx)
            if cell_listings is None:
                continue
            # Exclude ids picked up from cells processed earlier in this ring.
            cell_listings = cell_listings[~cell_listings["id"].isin(collected_ids)]
            if cell_listings.empty:
                continue

            ranked_cell = rank_candidates(cell_listings, prefer_color_rank=True)
            listings_to_add = ranked_cell.head(target_count - collected_count)
            collected_parts.append(listings_to_add)
            collected_ids.update(listings_to_add["id"])
            collected_count += len(listings_to_add)
            num_added_this_k_ring += len(listings_to_add)

        print(f"Added {num_added_this_k_ring} listings from k-ring {k}. Total collected: {collected_count}")

    if collected_parts:
        collected_listings_df = pd.concat(collected_parts)
    else:
        # Nothing found: keep the pool's schema so callers get the expected columns.
        collected_listings_df = candidate_pool.head(0)

    final_selected_listings = collected_listings_df.head(target_count)
    print(f"\nFinal selected dataset size for pricing: {len(final_selected_listings)}")
    if target_count == 0:
        print("Warning: Target count was 0, no listings selected.")
    elif len(final_selected_listings) < target_count:
        print(
            f"Warning: Could only retrieve {len(final_selected_listings)} listings, "
            f"less than target {target_count}."
        )

    # Strip ranking helper columns (including ones that may already have been
    # present on the input frame).
    cols_to_drop_from_final = ['distance_to_test', 'bedroom_diff', 'color_rank_diff', 'distance_to_test_loc_fallback']
    final_selected_listings = final_selected_listings.drop(
        columns=[col for col in cols_to_drop_from_final if col in final_selected_listings.columns],
        errors='ignore'
    )

    return final_selected_listings


