#
# import pandas as pd
# import json
# from datetime import datetime
# import re
# from collections import defaultdict
# from filter200 import process_dataframe
#
# import mysql.connector
# import json
#
# def get_bin(days):
#     if days == 1:
#         return '1_day_stays'
#     elif days == 2:
#         return '2_day_stays'
#     elif 3 <= days <= 4:
#         return '3_4_day_stays'
#     elif 5 <= days <= 6:
#         return '5_6_day_stays'
#     elif 7 <= days <= 14:
#         return '7_14_day_stays'
#     elif 15 <= days <= 28:
#         return '15_28_day_stays'
#     else:
#         return '28+_day_stays'
#
#
# def generate_recommendations(df, test_df):
#     # Special parsing function for DatetimeIndex strings
#     def parse_datetime_index(date_str):
#         try:
#             if pd.isna(date_str) or date_str == '[]' or date_str == 'ns':
#                 return []
#
#             # Check if it's a DatetimeIndex string representation
#             if isinstance(date_str, str) and date_str.startswith('DatetimeIndex(['):
#                 # Extract dates from the DatetimeIndex string format
#                 date_pattern = r"'(\d{4}-\d{2}-\d{2})'"
#                 dates = re.findall(date_pattern, date_str)
#                 return [pd.to_datetime(d) for d in dates]
#
#             # Handle regular list format (for completeness)
#             elif isinstance(date_str, str):
#                 try:
#                     date_list = eval(date_str)
#                     if isinstance(date_list, list):
#                         return [pd.to_datetime(d) for d in date_list if d != 'ns']
#                     return []
#                 except:
#                     return []
#
#             # Handle case where it's already a list
#             elif isinstance(date_str, list):
#                 return [pd.to_datetime(d) for d in date_str if d != 'ns']
#
#             return []
#         except Exception as e:
#             print(f"Error processing: {str(e)}")
#             return []
#
#     # Apply the specialized parsing function
#     print("Converting date strings to datetime objects...")
#     df['available_dates'] = df['available_dates'].apply(parse_datetime_index)
#
#     # Print some statistics about the parsed dates
#     dates_count = df['available_dates'].apply(len)
#     print(f"Date counts statistics: Min={dates_count.min()}, Max={dates_count.max()}, Mean={dates_count.mean():.1f}")
#     print(f"Number of listings with dates: {(dates_count > 0).sum()} out of {len(df)}")
#
#     # Calculate global date range
#     all_dates = []
#     for dates in df['available_dates']:
#         all_dates.extend([d for d in dates if not pd.isna(d)])
#
#     if not all_dates:
#         print("No valid dates found in the dataset!")
#         return json.dumps({}, indent=2)
#
#     min_date, max_date = min(all_dates), max(all_dates)
#     print(f"Date range: {min_date.date()} to {max_date.date()}")
#
#     # Initialize structure to hold per-month stay counts
#     month_bins = defaultdict(lambda: defaultdict(int))
#
#     # Process each listing's available dates
#     processed_count = 0
#     for dates in df['available_dates']:
#         if not dates:  # Check if list is empty
#             continue
#
#         dates = sorted(dates)
#         gaps = []
#
#         # Pre gap (before first available date)
#         first_date = dates[0]
#         if first_date > min_date:
#             pre_start = min_date
#             pre_end = first_date - pd.Timedelta(days=1)
#             gaps.append((pre_start, pre_end))
#
#         # Between gaps
#         for i in range(1, len(dates)):
#             prev_date = dates[i - 1]
#             current_date = dates[i]
#             if prev_date + pd.Timedelta(days=1) < current_date:
#                 gap_start = prev_date + pd.Timedelta(days=1)
#                 gap_end = current_date - pd.Timedelta(days=1)
#                 gaps.append((gap_start, gap_end))
#
#         # Post gap (after last available date)
#         last_date = dates[-1]
#         if last_date < max_date:
#             post_start = last_date + pd.Timedelta(days=1)
#             post_end = max_date
#             gaps.append((post_start, post_end))
#
#         # Process gaps by month
#         for gap_start, gap_end in gaps:
#             current_start = gap_start
#             while current_start <= gap_end:
#                 next_month = current_start + pd.offsets.MonthBegin(1)
#                 month_end = min(gap_end, next_month - pd.Timedelta(days=1))
#                 days_in_gap = (month_end - current_start).days + 1
#
#                 bin_name = get_bin(days_in_gap)
#                 month_year = current_start.strftime('%b_%Y')
#                 month_bins[month_year][bin_name] += 1
#
#                 current_start = month_end + pd.Timedelta(days=1)
#
#         processed_count += 1
#
#     print(f"Processed {processed_count} listings with available dates")
#
#     # Create booking trends DataFrame
#     booking_trends = pd.DataFrame.from_dict(month_bins, orient='index').fillna(0).T
#     sorted_months = []
#     if not booking_trends.empty and booking_trends.columns.any():
#         sorted_months = sorted(booking_trends.columns,
#                                key=lambda x: datetime.strptime(x, "%b_%Y"))
#         booking_trends = booking_trends[sorted_months]
#
#     print(f"Generated trends for {len(sorted_months)} months")
#
#     # Generate recommendations
#     recommendations = {}
#     if sorted_months:
#         # Safely get current and next occupancy with fallbacks
#         try:
#             current_occ = test_df.get(sorted_months[0], pd.Series([100])).iloc[0]
#             print(f"Current month occupancy ({sorted_months[0]}): {current_occ}")
#         except (KeyError, IndexError, AttributeError) as e:
#             current_occ = 100
#
#         try:
#             next_occ = test_df.get(sorted_months[1], pd.Series([100])).iloc[0] if len(sorted_months) > 1 else 100
#             if len(sorted_months) > 1:
#                 print(f"Next month occupancy ({sorted_months[1]}): {next_occ}")
#         except (KeyError, IndexError, AttributeError) as e:
#             next_occ = 100
#
#         low_occupancy = current_occ < 40 or next_occ < 40
#         print(f"Low occupancy detected: {low_occupancy}")
#     else:
#         low_occupancy = False
#
#     for idx, month in enumerate(sorted_months):
#         if low_occupancy and idx < 2:
#             rec = {'weekday_min_stay': 1, 'weekend_min_stay': 1}
#         else:
#             if month in booking_trends:
#                 top_bins = booking_trends[month].nlargest(3)
#                 min_stay = 28
#                 stays = []
#                 for bin_name in top_bins.index:
#                     match = re.findall(r'\d+', bin_name.split('_')[0])
#                     if match:
#                         stays.append(int(match[0]))
#                 if stays:
#                     min_stay = min(stays)
#                 rec = {'weekday_min_stay': min_stay, 'weekend_min_stay': min_stay}
#             else:
#                 rec = {'weekday_min_stay': 1, 'weekend_min_stay': 1}
#         recommendations[month] = rec
#
#     print(f"Generated recommendations for {len(recommendations)} months")
#     return json.dumps(recommendations, indent=2)
#
#
#
#
#
# if __name__ == "__main__":
#     # Database Configuration
#     db_config = {
#         'host': '127.0.0.1',
#         'user': 'root',
#         'password': '',  # Add your password
#         'database': 'pricing'
#     }
#
#     # Default coordinates for process_dataframe
#     default_lat = 42.1713
#     default_lon = -73.9698
#
#     df_main_raw = pd.DataFrame()
#     df_test_single_row = pd.DataFrame()
#
#     try:
#         # 1. Fetch all data from rental_properties
#         print("Connecting to database for main rental properties data...")
#         conn = mysql.connector.connect(**db_config)
#         query_main = "SELECT * FROM rental_properties"
#         df_main_raw = pd.read_sql_query(query_main, conn)
#         print(f"Fetched {len(df_main_raw)} rows from rental_properties.")
#
#         if df_main_raw.empty:
#             print("Main rental_properties data is empty. Exiting.")
#             exit()
#
#         # 2. Process the main dataframe
#         print(f"Processing main dataframe with coordinates: lat={default_lat}, long={default_lon}")
#         # Ensure process_dataframe handles potential missing columns gracefully or select specific ones
#         df_processed_main = process_dataframe(df_main_raw.copy(), lat=default_lat, long=default_lon)
#         print(f"Main dataframe processed. Shape: {df_processed_main.shape}")
#
#         if df_processed_main.empty:
#             print("Processed main dataframe is empty. Exiting.")
#             exit()
#
#         # 3. Fetch a single row from userdynamiclistset for the test dataframe
#         print("Connecting to database for test property data (userdynamiclistset)...")
#         # Re-establish connection or use existing if still valid and no cursors are open
#         if not conn.is_connected():
#             conn = mysql.connector.connect(**db_config)
#
#         cursor = conn.cursor(dictionary=True)  # Fetch as dictionary
#         # Modify this query if you need a specific row, e.g., by ID
#         query_test = "SELECT * FROM userdynamiclistset ORDER BY id LIMIT 1"  # Example: get the first row by id
#         cursor.execute(query_test)
#         test_property_data_dict = cursor.fetchone()
#         cursor.close()
#
#         if test_property_data_dict:
#             # Convert the single row dictionary to a DataFrame
#             df_test_single_row = pd.DataFrame([test_property_data_dict])
#             print(f"Fetched test property data. Columns: {df_test_single_row.columns.tolist()}")
#         else:
#             print("No data found in userdynamiclistset to form the test DataFrame. Exiting.")
#             exit()
#
#         # 4. Call generate_recommendations
#         if not df_processed_main.empty and not df_test_single_row.empty:
#             print("\nCalling get_recommended_price...")
#             recommended_price = generate_recommendations(df_processed_main, df_test_single_row)
#             print(f"\n------------------------------------------")
#             print(f"FINAL RECOMMENDED PRICE: {recommended_price:.2f}")
#             print(f"------------------------------------------")
#         else:
#             print("Cannot calculate recommended price due to empty processed main or test dataframe.")
#
#     except mysql.connector.Error as err:
#         print(f"Database Error: {err}")
#     except FileNotFoundError:
#         print("Error: filter200.py not found. Make sure it's in the correct path.")
#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")
#         import traceback
#
#         traceback.print_exc()
#     finally:
#         if 'conn' in locals() and conn.is_connected():
#             conn.close()
#             print("Database connection closed.")



#################################################################################


#
# import pandas as pd
# import json
# from datetime import datetime, timedelta  # timedelta might not be needed here but good for date ops
# import re
# from collections import defaultdict
# from filter200 import process_dataframe  # Assuming filter200.py is in the same directory or PYTHONPATH
#
# import mysql.connector
# import ast  # For safely evaluating string literals like "['date1', 'date2']"
# from calendar import monthrange  # To get number of days in a month
#
#
# # Helper function to categorize stay durations (remains the same)
# def get_bin(days):
#     if days == 1:
#         return '1_day_stays'
#     elif days == 2:
#         return '2_day_stays'
#     elif 3 <= days <= 4:
#         return '3_4_day_stays'
#     elif 5 <= days <= 6:
#         return '5_6_day_stays'
#     elif 7 <= days <= 14:
#         return '7_14_day_stays'
#     elif 15 <= days <= 28:
#         return '15_28_day_stays'
#     else:
#         return '28+_day_stays'
#
#
# # Enhanced parsing function for date strings/lists
# def parse_available_dates_column(date_input):
#     try:
#         if pd.isna(date_input) or str(date_input).lower() in ['[]', '{}', 'nan', 'nat', 'ns']:
#             return []
#
#         # 1. If already a list (e.g., if DataFrame column was pre-processed)
#         if isinstance(date_input, list):
#             # Ensure items in the list are valid for pd.to_datetime
#             valid_items = [d for d in date_input if
#                            isinstance(d, (str, datetime, pd.Timestamp)) and str(d).lower() != 'ns']
#             parsed_dates = [pd.to_datetime(d, errors='coerce') for d in valid_items]
#             return [d for d in parsed_dates if pd.notna(d)]
#
#         # 2. If it's a string, try various parsing methods
#         if isinstance(date_input, str):
#             date_str = date_input.strip()  # General strip
#             if not date_str: return []
#
#             # A. DatetimeIndex string representation
#             if date_str.startswith('DatetimeIndex(['):
#                 date_pattern = r"'(\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?)?)'"
#                 extracted_dates = re.findall(date_pattern, date_str)
#                 parsed_dates = [pd.to_datetime(d, errors='coerce') for d in extracted_dates]
#                 return [d for d in parsed_dates if pd.notna(d)]
#
#             # B. String representation of a list (e.g., "['2023-01-01', '2023-01-02']")
#             #    Clean up common issues like trailing semicolons and newlines before ast.literal_eval
#             if (date_str.startswith('[') and date_str.endswith(']')) or \
#                     (date_str.startswith('(') and date_str.endswith(')')) or \
#                     (date_str.startswith('[') and date_str.endswith('];')):  # Check for trailing semicolon specifically
#
#                 # Remove trailing semicolon if present
#                 if date_str.endswith(';'):
#                     date_str = date_str[:-1].strip()
#
#                 # Replace newlines and multiple spaces within the string
#                 # This helps if the string is formatted like the error output
#                 date_str = re.sub(r'\s*\n\s*', '', date_str)  # Remove newlines and surrounding spaces
#                 date_str = re.sub(r'\s+', ' ', date_str)  # Condense multiple spaces to one
#                 # Ensure commas are followed by a space if they aren't, for cleaner ast.literal_eval
#                 date_str = date_str.replace('","', '", "')
#
#                 try:
#                     evaluated_data = ast.literal_eval(date_str)
#                     if isinstance(evaluated_data, (list, tuple)):
#                         valid_items = [d for d in evaluated_data if isinstance(d, str) and d.lower() != 'ns']
#                         parsed_dates = [pd.to_datetime(d, errors='coerce') for d in valid_items]
#                         return [d for d in parsed_dates if pd.notna(d)]
#                 except (ValueError, SyntaxError, TypeError) as e_ast:
#                     print(f"ast.literal_eval failed for '{date_str}' after cleaning: {e_ast}")
#                     # Fall through to C if ast.literal_eval still fails
#
#             # C. Single date string (less likely for 'available_dates' but good fallback)
#             try:
#                 single_date = pd.to_datetime(date_str, errors='coerce')
#                 if pd.notna(single_date):
#                     return [single_date]
#             except Exception:
#                 pass
#
#         print(
#             f"Warning: Could not parse date input: '{str(date_input)[:200]}...' of type {type(date_input)}")  # Print only a snippet
#         return []
#     except Exception as e:
#         print(f"Unexpected error in parse_available_dates_column for input '{str(date_input)[:200]}...': {e}")
#         return []
#
#
# # Helper function to calculate monthly occupancy for a single property
# def calculate_monthly_occupancy_for_property(property_available_dates_list, relevant_month_year_keys):
#     """
#     Calculates monthly occupancy percentages for a single property.
#
#     Args:
#         property_available_dates_list (list): List of pd.Timestamp objects when the property is available.
#         relevant_month_year_keys (list): List of strings like "Jan_2024", "Feb_2024".
#
#     Returns:
#         dict: Keys are month_year_keys, values are occupancy percentages.
#     """
#     occupancy_data = {}
#     if not property_available_dates_list:
#         # If no availability data, we can't calculate occupancy.
#         # The main function will default to 100% if this dict is empty for a month.
#         print("Property has no available_dates provided; cannot calculate its specific occupancy.")
#         return {}
#
#     # Normalize available dates to date objects for easier comparison
#     available_dates_set = set(d.normalize() for d in property_available_dates_list if pd.notna(d))
#
#     for month_year_key in relevant_month_year_keys:
#         try:
#             month_dt = datetime.strptime(month_year_key, "%b_%Y")
#             year, month = month_dt.year, month_dt.month
#         except ValueError:
#             print(f"Warning: Could not parse month_year_key for occupancy calculation: {month_year_key}")
#             continue
#
#         days_in_month = monthrange(year, month)[1]
#         available_count_in_month = 0
#         for day_num in range(1, days_in_month + 1):
#             current_day = pd.Timestamp(year, month, day_num).normalize()
#             if current_day in available_dates_set:
#                 available_count_in_month += 1
#
#         occupied_in_this_month = days_in_month - available_count_in_month
#         occupancy_percentage = (occupied_in_this_month / days_in_month) * 100 if days_in_month > 0 else 0
#         occupancy_data[month_year_key] = occupancy_percentage
#
#     return occupancy_data
#
#
# # Main logic function to generate stay recommendations
# def generate_recommendations(df, test_df):
#     # Apply the enhanced parsing function to the 'available_dates' column of the main df
#     print("Converting 'available_dates' strings to datetime objects for main DataFrame...")
#     if 'available_dates' not in df.columns:
#         print("Error: 'available_dates' column not found in the main processed dataframe.")
#         return json.dumps({"error": "Missing 'available_dates' column in main df"}, indent=2)
#     df['available_dates'] = df['available_dates'].apply(parse_available_dates_column)
#
#     dates_count = df['available_dates'].apply(len)
#     print(
#         f"Main df date counts stats: Min={dates_count.min() if not dates_count.empty else 0}, Max={dates_count.max() if not dates_count.empty else 0}, Mean={dates_count.mean() if not dates_count.empty else 0:.1f}")
#     print(f"Main df num listings with dates: {(dates_count > 0).sum()} out of {len(df)}")
#
#     all_dates = [d for dates_list in df['available_dates'] for d in dates_list if pd.notna(d)]
#     if not all_dates:
#         print("No valid dates found in the main dataset's 'available_dates' column!")
#         return json.dumps({"message": "No availability data in main df to process"}, indent=2)
#
#     min_date, max_date = min(all_dates), max(all_dates)
#     print(f"Overall date range from main df: {min_date.date()} to {max_date.date()}")
#
#     month_bins = defaultdict(lambda: defaultdict(int))
#     processed_count = 0
#     # ... (gap calculation logic for main_df remains the same) ...
#     for dates in df['available_dates']:
#         if not dates: continue
#         dates = sorted(list(set(d for d in dates if pd.notna(d))))
#         if not dates: continue
#         gaps = []
#         first_listing_date = dates[0]
#         if first_listing_date > min_date:
#             pre_start = min_date
#             pre_end = first_listing_date - pd.Timedelta(days=1)
#             if pre_start <= pre_end: gaps.append((pre_start, pre_end))
#         for i in range(len(dates) - 1):
#             prev_date = dates[i]
#             current_date = dates[i + 1]
#             if (current_date - prev_date).days > 1:
#                 gap_start = prev_date + pd.Timedelta(days=1)
#                 gap_end = current_date - pd.Timedelta(days=1)
#                 gaps.append((gap_start, gap_end))
#         last_listing_date = dates[-1]
#         if last_listing_date < max_date:
#             post_start = last_listing_date + pd.Timedelta(days=1)
#             post_end = max_date
#             if post_start <= post_end: gaps.append((post_start, post_end))
#         for gap_start, gap_end in gaps:
#             current_month_start = gap_start
#             while current_month_start <= gap_end:
#                 next_month_first_day = (current_month_start.replace(day=1) + pd.DateOffset(months=1))
#                 current_month_actual_end = min(gap_end, next_month_first_day - pd.Timedelta(days=1))
#                 days_in_segment = (current_month_actual_end - current_month_start).days + 1
#                 bin_name = get_bin(days_in_segment)
#                 month_year_key = current_month_start.strftime('%b_%Y')
#                 month_bins[month_year_key][bin_name] += 1
#                 current_month_start = current_month_actual_end + pd.Timedelta(days=1)
#         processed_count += 1
#     print(f"Processed {processed_count} listings from main_df for gap analysis.")
#
#     booking_trends = pd.DataFrame.from_dict(month_bins, orient='index').fillna(0)
#     sorted_months_from_main_df = []  # This is the timeline for recommendations
#     if not booking_trends.empty:
#         try:
#             sorted_months_from_main_df = sorted(booking_trends.index, key=lambda x: datetime.strptime(x, "%b_%Y"))
#             booking_trends = booking_trends.reindex(sorted_months_from_main_df).T  # Transpose after sorting index
#         except ValueError as ve:
#             print(f"Warning: Month sorting error for main_df trends: {ve}. Using unsorted.")
#             sorted_months_from_main_df = booking_trends.index.tolist()
#             booking_trends = booking_trends.T
#     print(f"Generated gap trends for {len(sorted_months_from_main_df)} months (from main_df).")
#
#     # --- Occupancy for test_df ---
#     test_property_occupancies = {}  # Will store all available occupancies for test_df
#
#     if not test_df.empty:
#         # Option 1: Try to get direct occupancy columns from test_df
#         # We need to know which months test_df *could* have occupancy for.
#         # We can infer this if it has 'available_dates' or by checking all sorted_months_from_main_df
#
#         potential_test_df_months = []
#         # Check for direct occupancy columns first across all months derived from main_df
#         for month_key in sorted_months_from_main_df:
#             if month_key in test_df.columns and pd.api.types.is_numeric_dtype(test_df[month_key]):
#                 test_property_occupancies[month_key] = test_df[month_key].iloc[0]
#                 potential_test_df_months.append(month_key)
#
#         if not potential_test_df_months and 'available_dates' in test_df.columns:  # If no direct cols, try calculating
#             print("Test_df lacks direct occupancy columns. Calculating from its 'available_dates'.")
#             test_av_dates_raw = test_df['available_dates'].iloc[0]
#             test_av_dates_parsed = parse_available_dates_column(test_av_dates_raw)
#
#             if test_av_dates_parsed:
#                 print(f"Parsed {len(test_av_dates_parsed)} available dates for the test property.")
#                 # Calculate for all months relevant to main_df, then we'll pick the first available ones
#                 calculated_occupancies = calculate_monthly_occupancy_for_property(
#                     test_av_dates_parsed,
#                     sorted_months_from_main_df  # Calculate for the whole range
#                 )
#                 test_property_occupancies.update(calculated_occupancies)  # Add calculated ones
#                 if calculated_occupancies:
#                     print(f"Calculated occupancies for test property (sample): "
#                           f"{ {k: v for i, (k, v) in enumerate(calculated_occupancies.items()) if i < 3} }")
#             else:
#                 print("Could not parse 'available_dates' for test_df or no dates found after parsing.")
#         elif not potential_test_df_months:
#             print("Test_df lacks direct occupancy columns and no 'available_dates' column found to calculate from.")
#
#     # --- Determine Low Occupancy based on test_df's actual available data ---
#     low_occupancy_triggered = False
#     low_occupancy_override_months = []  # Store the specific months to apply override
#
#     if test_property_occupancies:
#         # Get months from test_property_occupancies, sort them, and pick the first two
#         # These months must also be in sorted_months_from_main_df to be relevant
#
#         # Filter test_property_occupancies keys to only those present in sorted_months_from_main_df
#         # and then sort them chronologically.
#         test_df_actual_data_months_sorted = sorted(
#             [m for m in test_property_occupancies.keys() if m in sorted_months_from_main_df],
#             key=lambda x: datetime.strptime(x, "%b_%Y")
#         )
#
#         if test_df_actual_data_months_sorted:
#             test_df_current_month_key = test_df_actual_data_months_sorted[0]
#             test_df_current_occ = test_property_occupancies.get(test_df_current_month_key, 100)
#             print(
#                 f"Test_df's first available data month: {test_df_current_month_key}, Occupancy: {test_df_current_occ:.2f}%")
#
#             test_df_next_occ = 100
#             if len(test_df_actual_data_months_sorted) > 1:
#                 test_df_next_month_key = test_df_actual_data_months_sorted[1]
#                 test_df_next_occ = test_property_occupancies.get(test_df_next_month_key, 100)
#                 print(
#                     f"Test_df's second available data month: {test_df_next_month_key}, Occupancy: {test_df_next_occ:.2f}%")
#
#                 if test_df_current_occ < 40 or test_df_next_occ < 40:
#                     low_occupancy_triggered = True
#                     low_occupancy_override_months.append(test_df_current_month_key)
#                     low_occupancy_override_months.append(test_df_next_month_key)
#             elif test_df_current_occ < 40:  # Only one month of data available for test_df
#                 low_occupancy_triggered = True
#                 low_occupancy_override_months.append(test_df_current_month_key)
#
#             print(f"Low occupancy trigger based on test_df's data (<40%): {low_occupancy_triggered}")
#             if low_occupancy_triggered:
#                 print(f"Months targeted for override due to low occupancy: {low_occupancy_override_months}")
#         else:
#             print("No common months with occupancy data found between test_df and main_df's timeline.")
#     else:
#         print("No occupancy data available for test_df to check low occupancy.")
#
#     # --- Generate recommendations based on the main_df timeline ---
#     recommendations = {}
#     if not sorted_months_from_main_df:
#         print("No sorted months from main_df; cannot generate recommendations.")
#         return json.dumps({"message": "No timeline for recommendations"}, indent=2)
#
#     for month_year_key in sorted_months_from_main_df:
#         # Apply override if this month is one of the low_occupancy_override_months
#         if low_occupancy_triggered and month_year_key in low_occupancy_override_months:
#             rec = {'weekday_min_stay': 1, 'weekend_min_stay': 1}
#             print(f"Applying low occupancy override (1-day min stay) for {month_year_key}")
#         else:
#             # Default recommendation based on main_df's booking_trends (gap analysis)
#             if month_year_key in booking_trends.columns and not booking_trends[month_year_key].empty:
#                 top_bins = booking_trends[month_year_key].nlargest(3)
#                 min_stay_recommendation = 28
#                 stays_from_bins = []
#                 for bin_name_of_gap in top_bins.index:
#                     match = re.findall(r'\d+', bin_name_of_gap)
#                     if match: stays_from_bins.append(int(match[0]))
#                 if stays_from_bins: min_stay_recommendation = min(stays_from_bins)
#                 rec = {'weekday_min_stay': min_stay_recommendation, 'weekend_min_stay': min_stay_recommendation}
#             else:
#                 # Fallback if no trend data for the month in main_df
#                 rec = {'weekday_min_stay': 1, 'weekend_min_stay': 1}
#                 print(f"No booking trend data for {month_year_key} in main_df; defaulting to 1-day min stay.")
#         recommendations[month_year_key] = rec
#     print(f"Generated recommendations for {len(recommendations)} months.")
#     return json.dumps(recommendations, indent=2)
#
#
# if __name__ == "__main__":
#     # Database Configuration
#     db_config = {
#         'host': '127.0.0.1',
#         'user': 'root',
#         'password': '',  # Add your password
#         'database': 'pricing'
#     }
#
#     # Default coordinates for process_dataframe
#     default_lat = 42.1713
#     default_lon = -73.9698
#
#     df_main_raw = pd.DataFrame()
#     df_test_single_row = pd.DataFrame()
#     df_processed_main = pd.DataFrame()
#     conn = None
#
#     try:
#         print("Connecting to database for main rental properties data...")
#         conn = mysql.connector.connect(**db_config)
#         # Select only necessary columns if possible, especially 'available_dates' and those needed by process_dataframe
#         query_main = "SELECT id, available_dates, <other_columns_needed_by_process_dataframe> FROM rental_properties"
#         # For testing, let's assume process_dataframe needs all for now, but optimize this query in production.
#         query_main = "SELECT * FROM rental_properties"
#         df_main_raw = pd.read_sql_query(query_main, conn)
#         print(f"Fetched {len(df_main_raw)} rows from rental_properties.")
#
#         if df_main_raw.empty:
#             print("Main rental_properties data is empty. Exiting.")
#             exit()
#
#         print(f"Processing main dataframe with coordinates: lat={default_lat}, long={default_lon}")
#         df_processed_main = process_dataframe(df_main_raw.copy(), lat=default_lat, long=default_lon)
#         print(f"Main dataframe processed. Shape: {df_processed_main.shape}")
#
#         if df_processed_main.empty:
#             print("Processed main dataframe is empty. Exiting.")
#             exit()
#
#         if 'available_dates' not in df_processed_main.columns:
#             print(f"CRITICAL: 'available_dates' column is MISSING from df_processed_main after process_dataframe.")
#             print(f"Columns in df_processed_main: {df_processed_main.columns.tolist()}")
#             print("Exiting as 'available_dates' is essential.")
#             exit()
#
#         print("Connecting to database for test property data (userdynamiclistset)...")
#         if conn is None or not conn.is_connected():
#             conn = mysql.connector.connect(**db_config)
#
#         cursor = conn.cursor(dictionary=True)
#         # Ensure userdynamiclistset has an 'available_dates' column if you want to calculate occupancy from it.
#         # Or, ensure it has columns like 'Mar_2024', 'Apr_2024' with occupancy percentages.
#         query_test = "SELECT *, available_dates FROM userdynamiclistset ORDER BY id LIMIT 1"  # Ensure available_dates is fetched
#         cursor.execute(query_test)
#         test_property_data_dict = cursor.fetchone()
#         cursor.close()
#
#         if test_property_data_dict:
#             df_test_single_row = pd.DataFrame([test_property_data_dict])
#             print(f"Fetched test property data. Columns: {df_test_single_row.columns.tolist()}")
#             if 'available_dates' not in df_test_single_row.columns:
#                 print(
#                     "Warning: 'available_dates' column not found in test_df (userdynamiclistset). Occupancy cannot be calculated if direct occupancy columns are also missing.")
#         else:
#             print("No data found in userdynamiclistset. Using empty test DataFrame.")
#             df_test_single_row = pd.DataFrame()
#
#         if not df_processed_main.empty:
#             print("\nCalling generate_recommendations...")
#             recommendations_json = generate_recommendations(df_processed_main, df_test_single_row)
#             print(f"\n------------------------------------------")
#             print(f"FINAL MINIMUM STAY RECOMMENDATIONS (JSON):")
#             print(recommendations_json)
#             print(f"------------------------------------------")
#         else:
#             print("Cannot generate recommendations: processed main dataframe is empty.")
#
#     except mysql.connector.Error as err:
#         print(f"Database Error: {err}")
#     except FileNotFoundError:
#         print("Error: filter200.py not found. Make sure it's in the correct path and contains process_dataframe.")
#     except ImportError as e:
#         print(f"Import Error: {e}. Check filter200.py and function names.")
#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")
#         import traceback
#
#         traceback.print_exc()
#     finally:
#         if conn and conn.is_connected():
#             conn.close()
#             print("Database connection closed.")


###########################################################################################################


import pandas as pd
import json
from datetime import datetime, timedelta
import re
from collections import defaultdict
from filter200 import select_comparables_for_pricing # Assuming this is in your environment

import mysql.connector
import ast
from calendar import monthrange


# Helper function to categorize stay durations
def get_bin(days):
    """Return the stay-length bucket label for a duration of ``days``.

    Buckets are inclusive ranges: 1, 2, 3-4, 5-6, 7-14 and 15-28 days. Any
    value that falls in none of them (including values above 28) is labelled
    ``'28+_day_stays'``.
    """
    bucket_ranges = (
        (1, 1, '1_day_stays'),
        (2, 2, '2_day_stays'),
        (3, 4, '3_4_day_stays'),
        (5, 6, '5_6_day_stays'),
        (7, 14, '7_14_day_stays'),
        (15, 28, '15_28_day_stays'),
    )
    for lower, upper, label in bucket_ranges:
        if lower <= days <= upper:
            return label
    # Catch-all bucket for everything not matched above.
    return '28+_day_stays'


# Parse one 'available_dates' cell (list or string form) into a list of timestamps
# String forms (compared case-insensitively) that mean "no dates".
_EMPTY_DATE_TOKENS = ('[]', '{}', 'nan', 'nat', 'ns')

# Quoted ISO dates (optionally with a 'T' time part) inside a DatetimeIndex repr.
_DATETIMEINDEX_DATE_RE = re.compile(
    r"'(\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?)?)'"
)


def _coerce_to_timestamps(candidates):
    """Convert each candidate with ``pd.to_datetime`` and drop values that fail to parse."""
    converted = (pd.to_datetime(item, errors='coerce') for item in candidates)
    return [ts for ts in converted if pd.notna(ts)]


def parse_available_dates_column(date_input):
    """Parse one ``available_dates`` cell into a list of ``pd.Timestamp`` values.

    Accepted inputs:
      * a list, tuple or pandas Index of date strings / datetime objects,
      * a single ``datetime`` / ``pd.Timestamp``,
      * the string repr of a ``DatetimeIndex([...])``,
      * a string holding a Python list/tuple literal of date strings
        (an optional trailing ``;`` and embedded newlines are tolerated),
      * a single date string.

    Items equal to ``'ns'`` and values that cannot be parsed are dropped.

    Returns:
        list: parsed timestamps in input order; ``[]`` for missing, empty or
        unparseable input. The function is best-effort and never raises.
    """
    try:
        # Containers must be handled before any scalar test: pd.isna() on a
        # list returns an array, and using that array in a boolean context
        # raises for more than one element, which previously made every
        # multi-element list come back empty.
        if isinstance(date_input, (list, tuple, pd.Index)):
            return _coerce_to_timestamps(
                item for item in date_input
                if isinstance(item, (str, datetime, pd.Timestamp)) and str(item).lower() != 'ns'
            )

        if isinstance(date_input, (datetime, pd.Timestamp)):
            # A lone datetime is treated like a one-element list (NaT is dropped).
            return _coerce_to_timestamps([date_input])

        if not isinstance(date_input, str):
            # None, NaN, numbers and any other unsupported type carry no dates.
            return []

        date_str = date_input.strip()
        if not date_str or date_str.lower() in _EMPTY_DATE_TOKENS:
            return []

        if date_str.startswith('DatetimeIndex(['):
            return _coerce_to_timestamps(_DATETIMEINDEX_DATE_RE.findall(date_str))

        looks_like_literal = (
            (date_str.startswith('[') and date_str.endswith((']', '];')))
            or (date_str.startswith('(') and date_str.endswith(')'))
        )
        if looks_like_literal:
            if date_str.endswith(';'):
                date_str = date_str[:-1].strip()
            # Collapse line breaks and runs of whitespace before evaluating the literal.
            date_str = re.sub(r'\s*\n\s*', '', date_str)
            date_str = re.sub(r'\s+', ' ', date_str)
            date_str = date_str.replace('","', '", "')
            try:
                evaluated_data = ast.literal_eval(date_str)
            except (ValueError, SyntaxError, TypeError) as e_ast:
                print(f"ast.literal_eval failed for '{date_str}' after cleaning: {e_ast}")  # Keep for debug
            else:
                if isinstance(evaluated_data, (list, tuple)):
                    return _coerce_to_timestamps(
                        item for item in evaluated_data
                        if isinstance(item, str) and item.lower() != 'ns'
                    )
            # Not a usable literal: fall through and try it as a single date.

        try:
            single_date = pd.to_datetime(date_str, errors='coerce')
        except (ValueError, TypeError, OverflowError):
            return []
        return [single_date] if pd.notna(single_date) else []
    except Exception as exc:
        # Best-effort parser used inside DataFrame.apply: report the failure
        # instead of hiding it, but never abort the whole column.
        print(f"Warning: could not parse available dates from {str(date_input)[:200]!r}: {exc}")
        return []


# Helper function to calculate monthly occupancy percentages for a single property
def calculate_monthly_occupancy_for_property(property_available_dates_list, relevant_month_year_keys):
    """Compute monthly occupancy percentages for one property.

    Args:
        property_available_dates_list: iterable of timestamps on which the
            property is available. Missing values (NaT/None) are ignored and
            time-of-day is discarded.
        relevant_month_year_keys: iterable of month keys in ``"%b_%Y"`` form
            (e.g. ``"Mar_2025"``). Keys that do not parse are skipped.

    Returns:
        dict: ``{month_key: occupancy_percent}`` for every requested month that
        overlaps the span between the property's first and last available
        date. Occupancy is ``(days_in_month - available_days) / days_in_month
        * 100``. Returns ``{}`` when there are no valid available dates.

    NOTE(review): days of a partially covered month that lie outside the
    property's first/last available date are counted as occupied, as in the
    previous implementation - confirm this is the intended treatment.
    """
    if property_available_dates_list is None:
        return {}

    # Filter missing values once, *before* taking min/max: a NaT in the raw
    # list would otherwise poison the range bounds.
    available_days = {d.normalize() for d in property_available_dates_list if pd.notna(d)}
    if not available_days:
        return {}

    first_available = min(available_days)
    last_available = max(available_days)

    # Count available days per (year, month) once instead of probing every
    # calendar day of every requested month.
    available_per_month = {}
    for day in available_days:
        month_id = (day.year, day.month)
        available_per_month[month_id] = available_per_month.get(month_id, 0) + 1

    occupancy_data = {}
    for month_year_key in relevant_month_year_keys:
        try:
            month_dt = datetime.strptime(month_year_key, "%b_%Y")
        except ValueError:
            continue  # Malformed month key.

        year, month = month_dt.year, month_dt.month
        days_in_month = monthrange(year, month)[1]
        month_start_date = pd.Timestamp(year, month, 1)
        month_end_date = pd.Timestamp(year, month, days_in_month)

        if month_end_date < first_available or month_start_date > last_available:
            continue  # Skip months outside the property's actual data range

        occupied_in_this_month = days_in_month - available_per_month.get((year, month), 0)
        occupancy_data[month_year_key] = (occupied_in_this_month / days_in_month) * 100

    return occupancy_data


# Build per-month minimum-stay recommendations, with a low-occupancy override for the test property
def _find_unavailable_gaps(sorted_unique_dates, window_start, window_end):
    """Return inclusive (start, end) spans of the window not covered by the given dates."""
    one_day = pd.Timedelta(days=1)
    candidate_spans = []

    first_date = sorted_unique_dates[0]
    if first_date > window_start:
        candidate_spans.append((window_start, first_date - one_day))

    for earlier, later in zip(sorted_unique_dates, sorted_unique_dates[1:]):
        if (later - earlier).days > 1:
            candidate_spans.append((earlier + one_day, later - one_day))

    last_date = sorted_unique_dates[-1]
    if last_date < window_end:
        candidate_spans.append((last_date + one_day, window_end))

    # Timestamps may carry a time-of-day, so a span can collapse; keep valid ones only.
    return [(start, end) for start, end in candidate_spans if start <= end]


def _split_gap_by_month(gap_start, gap_end):
    """Yield (month_key, days) for each calendar-month segment of an inclusive gap."""
    one_day = pd.Timedelta(days=1)
    segment_start = gap_start
    while segment_start <= gap_end:
        next_month_first_day = segment_start.replace(day=1) + pd.DateOffset(months=1)
        segment_end = min(gap_end, next_month_first_day - one_day)
        segment_days = (segment_end - segment_start).days + 1
        if segment_days > 0:  # Safeguard; should always hold.
            yield segment_start.strftime('%b_%Y'), segment_days
        segment_start = segment_end + one_day


def _find_low_occupancy_months(test_df, timeline_months):
    """Return the test property's months that get the 1-night override (may be empty).

    Occupancy comes from numeric columns of ``test_df`` named like the timeline
    months; if none exist it is calculated from ``test_df['available_dates']``.
    When the first or second month with data is below 40 % occupancy, both of
    those months are returned.
    """
    if test_df is None or test_df.empty:
        print("Test_df is empty. Cannot check for low occupancy.")
        return []

    # Attempt to get direct occupancy columns first.
    occupancies = {}
    for month_key in timeline_months:
        if month_key in test_df.columns and pd.api.types.is_numeric_dtype(test_df[month_key]):
            occupancies[month_key] = test_df[month_key].iloc[0]

    if not occupancies:
        if 'available_dates' in test_df.columns:
            print("Test_df lacks direct occupancy columns. Calculating from its 'available_dates'.")
            test_dates = parse_available_dates_column(test_df['available_dates'].iloc[0])
            if test_dates:
                print(f"Parsed {len(test_dates)} available dates for the test property.")
                # Only months overlapping the test property's own date span are returned.
                calculated = calculate_monthly_occupancy_for_property(test_dates, timeline_months)
                occupancies.update(calculated)
                if calculated:
                    sample = dict(list(calculated.items())[:3])
                    print(f"Calculated occupancies for test property (sample): {sample}")
            else:
                print("Could not parse 'available_dates' for test_df or no dates found after parsing.")
        else:
            print("Test_df lacks direct occupancy columns and no 'available_dates' column to calculate from.")

    if not occupancies:
        print("No occupancy data (direct or calculated) available for test_df to check low occupancy.")
        return []

    months_with_data = sorted(occupancies, key=lambda m: datetime.strptime(m, "%b_%Y"))

    first_month = months_with_data[0]
    first_occ = occupancies[first_month]
    print(f"Test_df's first actual data month: {first_month}, Occupancy: {first_occ:.2f}%")

    second_month = months_with_data[1] if len(months_with_data) > 1 else None
    second_occ = None
    if second_month is not None:
        second_occ = occupancies[second_month]
        print(f"Test_df's second actual data month: {second_month}, Occupancy: {second_occ:.2f}%")

    # 40 % is the low-occupancy threshold for either of the first two months.
    triggered = bool(first_occ < 40 or (second_month is not None and second_occ < 40))

    override_months = []
    if triggered:
        override_months.append(first_month)
        if second_month is not None:
            override_months.append(second_month)

    print(f"Low occupancy trigger based on test_df's actual data (<40%): {triggered}")
    if triggered:
        print(f"Months targeted for override due to low occupancy: {override_months}")
    return override_months


def _min_stay_from_bin_counts(bin_counts):
    """Return the smallest lower bound among the (up to) three most frequent observed bins."""
    # Ignore bins that never occurred in this month: the trend table is
    # zero-filled, so without this filter an unobserved bin could enter the
    # top three and pull the recommendation down.
    observed = bin_counts[bin_counts > 0]
    lower_bounds = []
    for bin_name in observed.nlargest(3).index:
        numbers = re.findall(r'\d+', bin_name)  # First number is the bin's lower bound.
        if numbers:
            lower_bounds.append(int(numbers[0]))
    return min(lower_bounds) if lower_bounds else 28


def generate_recommendations(df, test_df):
    """Build per-month minimum-stay recommendations as a JSON string.

    For every listing in ``df`` the unavailable gaps between its available
    dates (within the overall date range of all listings) are split per
    calendar month and counted per stay-length bin (see ``get_bin``). For each
    month the recommended minimum stay is the smallest lower bound among the
    three most frequent observed bins. Months flagged as low occupancy for the
    test property are overridden with a 1-night minimum.

    Args:
        df: DataFrame of comparable listings with an ``available_dates``
            column (any format accepted by ``parse_available_dates_column``).
            The DataFrame is not modified.
        test_df: DataFrame whose first row describes the property under test;
            may be empty or ``None``, in which case no override is applied.

    Returns:
        str: JSON object mapping ``"%b_%Y"`` month keys to
        ``{"weekday_min_stay": n, "weekend_min_stay": n}``, or a JSON object
        with an ``"error"`` / ``"message"`` key when there is nothing to
        process.
    """
    print("Converting 'available_dates' strings to datetime objects for main DataFrame...")
    if 'available_dates' not in df.columns:
        print("Error: 'available_dates' column not found in the main processed dataframe.")
        return json.dumps({"error": "Missing 'available_dates' column in main df"}, indent=2)

    # Parse into a local Series so the caller's DataFrame is left untouched.
    parsed_dates = df['available_dates'].apply(parse_available_dates_column)

    dates_count = parsed_dates.apply(len)
    print(
        f"Main df date counts stats: Min={dates_count.min() if not dates_count.empty else 0}, Max={dates_count.max() if not dates_count.empty else 0}, Mean={dates_count.mean() if not dates_count.empty else 0:.1f}")
    print(f"Main df num listings with dates: {(dates_count > 0).sum()} out of {len(df)}")

    all_main_df_dates = [d for dates_list in parsed_dates for d in dates_list if not pd.isna(d)]
    if not all_main_df_dates:
        print("No valid dates found in the main dataset!")
        return json.dumps({"message": "No availability data in main df to process"}, indent=2)

    main_df_min_date, main_df_max_date = min(all_main_df_dates), max(all_main_df_dates)
    print(f"Overall date range from main df: {main_df_min_date.date()} to {main_df_max_date.date()}")

    # month key -> bin label -> number of gap segments observed.
    month_bins = defaultdict(lambda: defaultdict(int))
    processed_count = 0
    for dates_list in parsed_dates:
        listing_dates = sorted({d for d in dates_list if pd.notna(d)})
        if not listing_dates:
            continue
        for gap_start, gap_end in _find_unavailable_gaps(listing_dates, main_df_min_date, main_df_max_date):
            for month_year, segment_days in _split_gap_by_month(gap_start, gap_end):
                month_bins[month_year][get_bin(segment_days)] += 1
        processed_count += 1
    print(f"Processed {processed_count} listings from main_df for gap analysis")

    # Rows are bins, columns are months.
    booking_trends = pd.DataFrame.from_dict(month_bins, orient='index').fillna(0).T
    sorted_months_from_main_df = []
    if not booking_trends.empty:
        sorted_months_from_main_df = sorted(booking_trends.columns,
                                            key=lambda x: datetime.strptime(x, "%b_%Y"))
        booking_trends = booking_trends[sorted_months_from_main_df]
    print(f"Generated gap trends for {len(sorted_months_from_main_df)} months (from main_df)")

    low_occupancy_override_months = _find_low_occupancy_months(test_df, sorted_months_from_main_df)

    if not sorted_months_from_main_df:
        print("No sorted months from main_df; cannot generate recommendations.")
        return json.dumps({"message": "No timeline for recommendations"}, indent=2)

    recommendations = {}
    for month_key in sorted_months_from_main_df:
        if month_key in low_occupancy_override_months:
            min_stay = 1
            print(f"Applying low occupancy override (1-day min stay) for {month_key}")
        elif month_key in booking_trends:  # booking_trends columns are months
            min_stay = _min_stay_from_bin_counts(booking_trends[month_key])
        else:
            # Fallback if no trend data for the month in main_df
            min_stay = 1
        recommendations[month_key] = {'weekday_min_stay': min_stay, 'weekend_min_stay': min_stay}

    print(f"Generated recommendations for {len(recommendations)} months")
    return json.dumps(recommendations, indent=2)


# if __name__ == "__main__":
#     # Database Configuration
#     db_config = {
#         'host': '127.0.0.1',
#         'user': 'root',
#         'password': '',  # Add your password
#         'database': 'pricing'
#     }
#     default_lat = 42.1713
#     default_lon = -73.9698
#     df_main_raw = pd.DataFrame()
#     df_test_single_row = pd.DataFrame()
#     df_processed_main = pd.DataFrame()
#     conn = None
#
#     try:
#         print("Connecting to database for main rental properties data...")
#         conn = mysql.connector.connect(**db_config)
#         query_main = "SELECT * FROM rental_properties"
#         df_main_raw = pd.read_sql_query(query_main, conn)
#         print(f"Fetched {len(df_main_raw)} rows from rental_properties.")
#         if df_main_raw.empty: print("Main rental_properties data is empty. Exiting."); exit()
#
#         print(f"Processing main dataframe with coordinates: lat={default_lat}, long={default_lon}")
#         df_processed_main = process_dataframe(df_main_raw.copy(), lat=default_lat, long=default_lon)
#         print(f"Main dataframe processed. Shape: {df_processed_main.shape}")
#         if df_processed_main.empty: print("Processed main dataframe is empty. Exiting."); exit()
#         if 'available_dates' not in df_processed_main.columns:  # Critical check
#             print(f"CRITICAL: 'available_dates' column is MISSING from df_processed_main after process_dataframe.");
#             exit()
#
#         print("Connecting to database for test property data (userdynamiclistset)...")
#         if conn is None or not conn.is_connected(): conn = mysql.connector.connect(**db_config)
#         cursor = conn.cursor(dictionary=True)
#         query_test = "SELECT * FROM userdynamiclistset ORDER BY id LIMIT 1"
#         cursor.execute(query_test)
#         test_property_data_dict = cursor.fetchone()
#         cursor.close()
#
#         if test_property_data_dict:
#             df_test_single_row = pd.DataFrame([test_property_data_dict])
#             print(f"Fetched test property data. Columns: {df_test_single_row.columns.tolist()}")
#             # Basic check for necessary columns for occupancy calculation or direct values
#             has_occ_info = 'available_dates' in df_test_single_row.columns
#             if not has_occ_info:
#                 # Check if any column looks like a direct occupancy column, e.g., "Mar_2025"
#                 month_year_pattern = re.compile(r"^[A-Za-z]{3}_\d{4}$")  # e.g., Mar_2025
#                 if not any(month_year_pattern.match(col) for col in df_test_single_row.columns):
#                     print(
#                         "Warning: 'available_dates' and direct occupancy columns (e.g., Mar_2025) seem missing from test_df.")
#         else:
#             print("No data found in userdynamiclistset. Using empty test DataFrame.")
#             df_test_single_row = pd.DataFrame()  # Ensure it's an empty DataFrame
#
#         if not df_processed_main.empty:  # df_test_single_row can be empty
#             print("\nCalling generate_recommendations...")
#             # The function name in your original code was generate_recommendations, not get_recommended_price
#             recommendations_json = generate_recommendations(df_processed_main, df_test_single_row)
#             print(f"\n------------------------------------------")
#             # The output is JSON string of recommendations, not a single price
#             print(f"FINAL MINIMUM STAY RECOMMENDATIONS (JSON):")
#             print(recommendations_json)
#             print(f"------------------------------------------")
#         else:
#             print("Cannot generate recommendations because the processed main dataframe is empty.")
#
#     except mysql.connector.Error as err:
#         print(f"Database Error: {err}")
#     except FileNotFoundError:
#         print("Error: filter200.py not found. Make sure it's in the correct path.")
#     except ImportError as e:
#         print(f"Import Error: {e}. Check filter200.py and function names.")
#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")
#         import traceback
#
#         traceback.print_exc()
#     finally:
#         if conn and conn.is_connected(): conn.close(); print("Database connection closed.")
















