From c3962e21b9038dea19493831f74b7a2ce267ba4f Mon Sep 17 00:00:00 2001
From: zms <zhangmingsong12@163.com>
Date: Wed, 21 May 2025 11:33:39 +0800
Subject: [PATCH] first commit

---
 rain_forcast_3h.py               | 2854 ++++++++++++++++++++++++++++++
 requirements.txt                 |    8 +
 src/data/data_processor.py       |  142 ++
 src/database/db_connector.py     |  236 +++
 src/forecast/forecast_system.py  |  154 ++
 src/main.py                      |  232 +++
 src/models/rainfall_model.py     |  176 ++
 src/utils/feature_engineering.py |  129 ++
 src/utils/visualization.py       |  113 ++
 9 files changed, 4044 insertions(+)
 create mode 100644 rain_forcast_3h.py
 create mode 100644 requirements.txt
 create mode 100644 src/data/data_processor.py
 create mode 100644 src/database/db_connector.py
 create mode 100644 src/forecast/forecast_system.py
 create mode 100644 src/main.py
 create mode 100644 src/models/rainfall_model.py
 create mode 100644 src/utils/feature_engineering.py
 create mode 100644 src/utils/visualization.py

diff --git a/rain_forcast_3h.py b/rain_forcast_3h.py
new file mode 100644
index 0000000..21c4d10
--- /dev/null
+++ b/rain_forcast_3h.py
@@ -0,0 +1,2854 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+未来多时间点降雨预报系统
+功能：
+1. 预测未来一小时、两小时、三小时是否会降雨
+2. 判断降雨后，分析未来一小时、两小时、三小时的降雨量
+"""
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_predict
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
+from sklearn.metrics import roc_curve, auc, precision_recall_curve
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+from sklearn.preprocessing import StandardScaler, RobustScaler
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.base import clone
+from sklearn.utils import resample
+import joblib
+import os
+import warnings
+import datetime
+from datetime import timedelta
+import time
+import argparse
+import traceback
+warnings.filterwarnings('ignore')
+
+class RainfallTwoStageModel:
+    """
+    两阶段降雨预测模型:
+    - 第一阶段: 分类器预测是否会下雨(二分类)
+    - 第二阶段: 对预测有雨的样本，先分类降雨强度，再使用分层回归模型预测具体降雨量
+    """
+    def __init__(self, debug=True, clf_model_type='ensemble', reg_model_type='gbdt', output_dir='figures'):
+        # 初始化模型组件
+        self.binary_classifier = None  # 二分类器（判断是否下雨）
+        self.regressor = None   # 回归器（单一模型，用作备选）
+        self.regressors = {}    # 分层回归器，针对不同降雨强度
+        self.rain_category_classifier = None  # 降雨类别分类器（小、中、大、暴雨）
+        self.scaler = None  # 特征缩放器
+        self.threshold = 0.5  # 分类阈值
+        self.feature_importances_clf = None  # 分类器特征重要性
+        self.feature_importances_reg = None  # 回归器特征重要性
+        self.feature_names = None  # 特征名称
+        self.debug = debug  # 调试输出
+        self.clf_model_type = clf_model_type  # 分类器类型
+        self.reg_model_type = reg_model_type  # 回归器类型
+        self.log_transform = True  # 是否对降雨量进行对数变换
+        self.feature_stats = {}  # 存储特征统计信息
+        self.stratified_regression = True  # 是否使用分层回归
+        self.rain_thresholds = [0.5, 3.0, 10.0]  # 降雨类别阈值：小雨、中雨、大雨、暴雨
+        self.heavy_rain_factor = 3.0  # 大雨预测安全系数
+        self.extreme_rain_factor = 5.0  # 暴雨预测安全系数
+        self.time_points = [1, 2, 3]  # 添加时间点属性
+
+        # 创建图表输出目录
+        self.output_dir = output_dir
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+    
+    def debug_print(self, message):
+        """调试输出"""
+        if self.debug:
+            print(f"[调试] {message}")
+    
+    def preprocess_data(self, data, silent=False):
+        """
+        数据预处理：特征工程与数据清洗，增强时间序列特征
+        
+        参数:
+            data: DataFrame 原始数据
+            silent: bool 是否抑制输出信息
+            
+        返回:
+            df: DataFrame 处理后的数据
+        """
+        df = data.copy()
+        
+        #print("aaa")
+        #print(df)
+        # 确保datetime列是datetime格式
+        if 'datetime' in df.columns:
+            df['datetime'] = pd.to_datetime(df['datetime'])
+        
+        # 数据类型转换，确保所有特征都是数值型
+        for col in df.columns:
+            if col not in ['datetime', 'timestamp', 'date', 'time']:
+                try:
+                    df[col] = pd.to_numeric(df[col], errors='coerce')
+                except:
+                    if not silent:
+                        print(f"警告: 无法将列 '{col}' 转换为数值类型")
+        
+        # 处理缺失值
+        if 'rain' in df.columns:
+            df = df.dropna(subset=['rain'])  # 删除目标变量缺失的行
+        df = df.fillna(method='ffill').fillna(method='bfill')  # 填充其他缺失值
+        
+        # 温度单位检查（保留原始单位）
+        
+        # 保存或使用特征统计信息
+        if not self.feature_stats:
+            # 训练期间：计算并保存统计数据
+            for var in ['rh', 'PWV', 'Temp', 'Press', 'ZTD']:
+                if var in df.columns:
+                    self.feature_stats[f"{var}_mean"] = df[var].mean()
+                    self.feature_stats[f"{var}_std"] = df[var].std()
+       
+        #print(df.head(10))
+        # 计算全面的时间序列特征
+        # 1. 异常值（相对平均值的标准化偏差）
+        for var in ['rh', 'PWV', 'Temp', 'Press', 'ZTD']:
+            if var in df.columns:
+                # 使用保存的均值或计算新均值
+                mean_value = self.feature_stats.get(f"{var}_mean", df[var].mean())
+                std_value = self.feature_stats.get(f"{var}_std", df[var].std())
+                if std_value > 0:
+                    df[f'{var}_anomaly'] = (df[var] - mean_value) / std_value
+                else:
+                    df[f'{var}_anomaly'] = 0
+        
+        # 2. 多时间尺度的变化率
+        for var in ['rh', 'PWV', 'ZTD', 'Press', 'Temp']:
+            if var in df.columns:
+                # 短期变化（1小时，3小时）
+                df[f'{var}_1h_change'] = df[var].diff(1).fillna(0)
+                df[f'{var}_3h_change'] = df[var].diff(3).fillna(0)
+                
+                # 中期变化（6小时）
+                df[f'{var}_6h_change'] = df[var].diff(6).fillna(0)
+                
+                # 长期变化（12小时）
+                df[f'{var}_12h_change'] = df[var].diff(12).fillna(0)
+                
+                # 变化加速度（变化率的变化率）
+                df[f'{var}_change_accel'] = df[f'{var}_1h_change'].diff(1).fillna(0)
+                
+                # 相对变化（百分比）
+                if df[var].mean() != 0:
+                    df[f'{var}_rel_change'] = df[f'{var}_1h_change'] / df[var].shift(1).fillna(df[var].mean())
+                
+                # 急剧变化指标（针对极端事件）
+                var_std = df[f'{var}_1h_change'].std()
+                if var_std > 0:
+                    df[f'{var}_rapid_change'] = (np.abs(df[f'{var}_1h_change']) > (2 * var_std)).astype(int)
+                else:
+                    df[f'{var}_rapid_change'] = 0
+        
+        # 3. 时间窗口特征 - 多种滚动窗口
+        for var in ['PWV', 'rh', 'Temp', 'Press', 'ZTD']:
+            if var in df.columns:
+                # 短窗口（3小时滚动平均）
+                df[f'{var}_3h_mean'] = df[var].rolling(window=3, min_periods=1).mean()
+                
+                # 中窗口（6小时滚动平均和标准差）
+                df[f'{var}_6h_mean'] = df[var].rolling(window=6, min_periods=1).mean()
+                df[f'{var}_6h_std'] = df[var].rolling(window=6, min_periods=1).std().fillna(0)
+                
+                # 长窗口（12小时滚动平均）
+                df[f'{var}_12h_mean'] = df[var].rolling(window=12, min_periods=1).mean()
+                
+                # 趋势指标：当前值与滚动平均的差异
+                df[f'{var}_vs_6h'] = df[var] - df[f'{var}_6h_mean']
+                
+                # 最大/最小值指标
+                df[f'{var}_12h_max'] = df[var].rolling(window=12, min_periods=1).max()
+                df[f'{var}_12h_min'] = df[var].rolling(window=12, min_periods=1).min()
+                df[f'{var}_12h_range'] = df[f'{var}_12h_max'] - df[f'{var}_12h_min']
+        
+        # 4. 天气模式指标和特征组合
+        if 'PWV' in df.columns and 'rh' in df.columns:
+            # 湿度可用性指数
+            df['PWV_rh'] = df['PWV'] * df['rh'] / 100
+            
+            # 湿度变化指标
+            if 'PWV_1h_change' in df.columns and 'rh_1h_change' in df.columns:
+                df['moisture_change'] = df['PWV_1h_change'] * df['rh_1h_change']
+                
+            # 累积湿度变化
+            df['PWV_cumul_change'] = df['PWV'].diff(1).rolling(window=6).sum().fillna(0)
+            df['rh_cumul_change'] = df['rh'].diff(1).rolling(window=6).sum().fillna(0)
+        
+        if 'PWV' in df.columns and 'Temp' in df.columns:
+            # 温度-湿度相互作用
+            df['PWV_Temp'] = df['PWV'] * df['Temp']
+            
+            # 饱和潜力
+            temp_exp = np.exp(17.27 * (df['Temp']-273.15) / ((df['Temp']-273.15) + 237.3))  # 转换为摄氏度计算
+            df['saturation_index'] = df['PWV'] / (0.1 * temp_exp)
+        
+        if 'Press' in df.columns and 'Press_1h_change' in df.columns:
+            # 气压变化强度（较大的变化表示更强的天气系统）
+            df['Press_change_intensity'] = df['Press_1h_change'].abs()
+            
+            # 气压系统变化：下降的气压往往与降雨系统有关
+            df['Press_falling'] = (df['Press_1h_change'] < 0).astype(int)
+            
+            # 气压快速下降指标（与强降水相关）
+            press_quantile = df['Press_1h_change'].abs().quantile(0.75)
+            df['Press_rapid_fall'] = ((df['Press_1h_change'] < 0) & 
+                                     (df['Press_1h_change'].abs() > press_quantile)).astype(int)
+        
+        if 'ZTD' in df.columns and 'PWV' in df.columns:
+            # ZTD-PWV比率（大气条件指标）
+            df['ZTD_PWV_ratio'] = df['ZTD'] / df['PWV'].replace(0, np.nan).fillna(df['PWV'].mean())
+            
+            # 组合变化指标
+            if 'ZTD_1h_change' in df.columns and 'PWV_1h_change' in df.columns:
+                df['ZTD_PWV_change'] = df['ZTD_1h_change'] * df['PWV_1h_change']
+                
+            # 湿度不稳定指标
+            pwv_mean = df['PWV_6h_mean'].mean()
+            if pwv_mean > 0:
+                df['moisture_instability'] = df['PWV_6h_std'] / df['PWV_6h_mean']
+            else:
+                df['moisture_instability'] = 0
+        
+        # 5. 基于多变量的降雨潜力指标
+        if all(var in df.columns for var in ['PWV', 'rh', 'Temp', 'Press_1h_change']):
+            # 组合降雨指标
+            df['rain_potential'] = (df['PWV'] * df['rh'] / 100) * (0.1 - df['Press_1h_change']) / (df['Temp'] + 10)
+            
+            # 强降雨潜力指标
+            pwv_mean = df['PWV'].mean() + 0.1  # 避免除以零
+            df['heavy_rain_potential'] = df['PWV'] * (df['rh'] / 100) * (df['Press_rapid_fall'] + 1) * (
+                1 + np.abs(df['PWV_1h_change']) / pwv_mean)
+            
+            # 降雨持续性指标
+            df['rain_persistence'] = df['PWV_cumul_change'] * df['rh_cumul_change'] * (
+                1 + df['Press_change_intensity'])
+        
+        # 创建二值标签 - 简单明了：1表示有雨，0表示无雨
+        if 'rain' in df.columns: 
+            df['rain_binary'] = (df['rain'] > 0.0).astype(int)
+            #df.loc[:, 'rain_binary'] = (df['rain'] > 0).astype(int)
+
+
+
+            # 创建降雨类别标签用于分层回归
+            df['rain_category'] = np.zeros(len(df), dtype=int)  # 默认：无雨
+            if self.stratified_regression:
+                df.loc[df['rain'] > 0, 'rain_category'] = 1  # 小雨
+                df.loc[df['rain'] > self.rain_thresholds[0], 'rain_category'] = 2  # 中雨
+                df.loc[df['rain'] > self.rain_thresholds[1], 'rain_category'] = 3  # 大雨
+                df.loc[df['rain'] > self.rain_thresholds[2], 'rain_category'] = 4  # 暴雨
+            
+            # 只在非静默模式下打印信息
+            if not silent:
+                # 打印降雨分布信息
+                rain_count = df['rain_binary'].sum()
+                print(f"\n降雨分布统计：")
+                print(f"总样本数: {len(df)}")
+                print(f"有雨样本: {rain_count} ({rain_count/len(df)*100:.2f}%)")
+                print(f"无雨样本: {len(df) - rain_count} ({(len(df)-rain_count)/len(df)*100:.2f}%)")
+                
+                # 分析降雨量分布和类别
+                rain_samples = df[df['rain'] > 0]
+                if len(rain_samples) > 0:
+                    print(f"\n降雨统计（仅有雨样本）：")
+                    print(f"最小值: {rain_samples['rain'].min():.2f}mm")
+                    print(f"最大值: {rain_samples['rain'].max():.2f}mm")
+                    print(f"平均值: {rain_samples['rain'].mean():.2f}mm")
+                    print(f"中位数: {rain_samples['rain'].median():.2f}mm")
+                    
+                    # 打印降雨类别
+                    if self.stratified_regression:
+                        light_rain = sum((df['rain'] > 0) & (df['rain'] <= self.rain_thresholds[0]))
+                        medium_rain = sum((df['rain'] > self.rain_thresholds[0]) & (df['rain'] <= self.rain_thresholds[1]))
+                        heavy_rain = sum((df['rain'] > self.rain_thresholds[1]) & (df['rain'] <= self.rain_thresholds[2]))
+                        extreme_rain = sum(df['rain'] > self.rain_thresholds[2])
+                        
+                        print(f"\n降雨类别：")
+                        print(f"小雨 (0-{self.rain_thresholds[0]}mm): {light_rain} 样本 ({light_rain/rain_count*100:.2f}%)")
+                        print(f"中雨 ({self.rain_thresholds[0]}-{self.rain_thresholds[1]}mm): {medium_rain} 样本 ({medium_rain/rain_count*100:.2f}%)")
+                        print(f"大雨 ({self.rain_thresholds[1]}-{self.rain_thresholds[2]}mm): {heavy_rain} 样本 ({heavy_rain/rain_count*100:.2f}%)")
+                        print(f"暴雨 (>{self.rain_thresholds[2]}mm): {extreme_rain} 样本 ({extreme_rain/rain_count*100:.2f}%)")
+                    
+                    # 绘制降雨量直方图
+                    plt.figure(figsize=(10, 6))
+                    plt.hist(rain_samples['rain'].values, bins=20)
+                    plt.title('Rainfall Distribution Histogram (Rain Samples Only)')
+                    plt.xlabel('Rainfall (mm)')
+                    plt.ylabel('Number of Samples')
+                    plt.grid(True, alpha=0.3)
+                    plt.savefig(f"{self.output_dir}/rainfall_distribution.png")
+                    plt.close()
+                    
+                    # 对数变换降雨量分析
+                    plt.figure(figsize=(10, 6))
+                    plt.hist(np.log1p(rain_samples['rain'].values), bins=20)
+                    plt.title('Log-Transformed Rainfall Distribution (Rain Samples Only)')
+                    plt.xlabel('log(Rainfall+1)')
+                    plt.ylabel('Number of Samples')
+                    plt.grid(True, alpha=0.3)
+                    plt.savefig(f"{self.output_dir}/rainfall_log_distribution.png")
+                    plt.close()
+        
+        # 如果数据中有时间特征，添加时间相关特征
+        if 'datetime' in df.columns:
+            # 小时特征（一天中的时刻）
+            df['hour'] = df['datetime'].dt.hour
+            df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
+            df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
+            
+            # 星期特征
+            df['day_of_week'] = df['datetime'].dt.dayofweek
+            df['weekend'] = (df['day_of_week'] >= 5).astype(int)
+            
+            # 月份特征（季节性）
+            df['month'] = df['datetime'].dt.month
+            df['season'] = (df['month'] % 12 + 3) // 3
+            
+            # 季节性循环特征
+            df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
+            df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
+        
+        return df
+    
+    def select_features(self, data, n_features=30, silent=False):
+        """
+        特征选择：基于相关性和实验结果选择最佳特征子集
+        
+        参数:
+            data: DataFrame 预处理后的数据
+            n_features: int 要选择的特征数量
+            silent: bool 是否抑制输出信息
+            
+        返回:
+            X: DataFrame 选定的特征矩阵
+            y_binary: Series 二值分类标签
+            y_amount: Series 降雨量标签
+            y_category: Series (可选) 降雨类别标签
+        """
+        # 选择基本特征（如果在预定义的feature_names中已存在）
+        if self.feature_names is not None:
+            # 检查预定义特征是否在数据中
+            available_features = [f for f in self.feature_names if f in data.columns]
+            if len(available_features) != len(self.feature_names):
+                missing = set(self.feature_names) - set(available_features)
+                if not silent:
+                    self.debug_print(f"警告：以下预定义特征在数据中缺失：{missing}")
+            selected_features = available_features
+        else:
+            # 选择新特征 - 优先考虑时间序列和变化特征
+            basic_features = ['rh', 'Temp', 'PWV', 'ZTD', 'ws', 'Press']
+            
+            # 不同时间尺度的变化特征
+            change_features = [
+                'PWV_1h_change', 'PWV_3h_change', 'PWV_6h_change', 'PWV_12h_change',
+                'ZTD_1h_change', 'ZTD_3h_change', 'ZTD_6h_change',
+                'Temp_1h_change', 'Temp_6h_change', 
+                'Press_1h_change', 'Press_6h_change',
+                'rh_1h_change', 'rh_6h_change',
+                'PWV_change_accel', 'ZTD_change_accel',
+                'PWV_rapid_change', 'ZTD_rapid_change', 'Press_rapid_change'
+            ]
+            
+            # 趋势和窗口特征
+            window_features = [
+                'PWV_3h_mean', 'PWV_6h_mean', 'PWV_vs_6h',
+                'ZTD_6h_mean', 'ZTD_vs_6h',
+                'Temp_6h_mean', 'Press_6h_mean', 'rh_6h_mean',
+                'PWV_6h_std', 'ZTD_6h_std', 
+                'PWV_12h_range', 'ZTD_12h_range'
+            ]
+            
+            # 异常和标准化特征
+            anomaly_features = [
+                'rh_anomaly', 'PWV_anomaly', 'ZTD_anomaly', 
+                'Temp_anomaly', 'Press_anomaly'
+            ]
+            
+            # 组合特征和指标
+            combined_features = [
+                'PWV_rh', 'PWV_Temp', 'ZTD_PWV_ratio', 'moisture_change',
+                'ZTD_PWV_change', 'Press_change_intensity', 'rain_potential',
+                'saturation_index', 'moisture_instability', 'heavy_rain_potential',
+                'rain_persistence', 'PWV_cumul_change', 'Press_rapid_fall',
+                'Press_falling'
+            ]
+            
+            # 时间相关特征
+            time_features = [
+                'hour', 'hour_sin', 'hour_cos', 
+                'day_of_week', 'weekend', 
+                'month', 'season', 'month_sin', 'month_cos'
+            ]
+            
+            # 合并特征列表并过滤仅存在于数据集中的特征
+            all_features = (basic_features + change_features + window_features + 
+                           anomaly_features + combined_features + time_features)
+            
+            # 过滤出数据中存在的特征
+            selected_features = [f for f in all_features if f in data.columns]
+            
+            # 如果可用特征超过所需数量，根据对降雨预测的价值进行优先排序
+            if len(selected_features) > n_features and 'rain' in data.columns:
+                # 计算与降雨量的相关性
+                try:
+                    rain_corr = data[selected_features].corrwith(data['rain']).abs()
+                    
+                    # 按降雨相关性排序特征
+                    sorted_features = rain_corr.sort_values(ascending=False).index.tolist()
+                except Exception as e:
+                    if not silent:
+                        print(f"计算特征相关性时出错: {str(e)}")
+                    sorted_features = selected_features
+                
+                # 始终保留一些基本特征，无论相关性如何
+                must_have_features = [f for f in [
+                    'PWV', 'ZTD', 'rh', 'Temp', 'PWV_1h_change', 
+                    'ZTD_1h_change', 'Press_1h_change', 'saturation_index',
+                    'heavy_rain_potential', 'rain_persistence'
+                ] if f in selected_features]
+                
+                # 将必备特征与相关性最高的特征结合，避免重复
+                remaining_features = [f for f in sorted_features if f not in must_have_features]
+                
+                # 选择最终特征集
+                selected_features = must_have_features + remaining_features[:n_features - len(must_have_features)]
+            
+            # 如果仍然有太多特征，只取前n_features个
+            if len(selected_features) > n_features:
+                selected_features = selected_features[:n_features]
+                
+            # 保存选择的特征
+            self.feature_names = selected_features
+        
+        # 获取特征矩阵
+        X = data[selected_features].copy()
+        
+        # 获取目标变量
+        y_binary = data['rain_binary'].copy() if 'rain_binary' in data.columns else None
+        y_amount = data['rain'].copy() if 'rain' in data.columns else None
+        y_category = data['rain_category'].copy() if 'rain_category' in data.columns else None
+        
+        # 打印选择的特征
+        if not silent:
+            print(f"\n选择的特征（{len(selected_features)}）：")
+            for i, feature in enumerate(selected_features):
+                print(f"  {i+1}. {feature}")
+        
+        return X, y_binary, y_amount, y_category
+    
+    def split_data(self, X, y_binary, y_amount, y_category=None, time_based=False, test_size=0.2):
+        """数据分割：可选择时间序列分割或随机分割"""
+        if time_based:
+            # 使用时间序列分割 - 假设数据按时间排序
+            train_size = int(len(X) * (1 - test_size))
+            X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
+            y_binary_train, y_binary_test = y_binary.iloc[:train_size], y_binary.iloc[train_size:]
+            y_amount_train, y_amount_test = y_amount.iloc[:train_size], y_amount.iloc[train_size:]
+            
+            if y_category is not None:
+                y_category_train, y_category_test = y_category.iloc[:train_size], y_category.iloc[train_size:]
+            else:
+                y_category_train, y_category_test = None, None
+                
+            print("\n使用基于时间序列的数据分割...")
+        else:
+            # 使用随机分层抽样
+            if y_category is not None:
+                X_train, X_test, y_binary_train, y_binary_test, y_amount_train, y_amount_test, y_category_train, y_category_test = train_test_split(
+                    X, y_binary, y_amount, y_category, test_size=test_size, random_state=42, stratify=y_binary
+                )
+            else:
+                X_train, X_test, y_binary_train, y_binary_test, y_amount_train, y_amount_test = train_test_split(
+                    X, y_binary, y_amount, test_size=test_size, random_state=42, stratify=y_binary
+                )
+                y_category_train, y_category_test = None, None
+            
+            print("\n使用随机分层抽样进行数据分割...")
+        
+        # 输出分割信息
+        print(f"训练集: {len(X_train)} 样本 ({sum(y_binary_train == 1)} 有雨, {sum(y_binary_train == 0)} 无雨)")
+        print(f"测试集: {len(X_test)} 样本 ({sum(y_binary_test == 1)} 有雨, {sum(y_binary_test == 0)} 无雨)")
+        
+        # 检查训练集和测试集中的雨量比例
+        train_rain_ratio = sum(y_binary_train == 1) / len(y_binary_train)
+        test_rain_ratio = sum(y_binary_test == 1) / len(y_binary_test)
+        print(f"训练集降雨比例: {train_rain_ratio:.2%}")
+        print(f"测试集降雨比例: {test_rain_ratio:.2%}")
+        
+        # 如果使用分层回归，打印类别分布
+        if y_category is not None:
+            for cat in sorted(y_category_train.unique()):
+                cat_count_train = sum(y_category_train == cat)
+                cat_count_test = sum(y_category_test == cat)
+                if cat > 0:  # 仅打印降雨类别
+                    cat_name = "小雨" if cat == 1 else "中雨" if cat == 2 else "大雨" if cat == 3 else "暴雨"
+                    print(f"{cat_name}: {cat_count_train} 训练样本, {cat_count_test} 测试样本")
+        
+        return X_train, X_test, y_binary_train, y_binary_test, y_amount_train, y_amount_test, y_category_train, y_category_test
+    
+    def scale_features(self, X_train, X_test):
+        """特征标准化：使用RobustScaler减少异常值的影响"""
+        # 首次调用时初始化缩放器
+        if self.scaler is None:
+            self.scaler = RobustScaler()
+            
+        # 将DataFrame转换为numpy数组
+        if isinstance(X_train, pd.DataFrame):
+            X_train_np = X_train.values
+        else:
+            X_train_np = X_train
+            
+        if isinstance(X_test, pd.DataFrame):
+            X_test_np = X_test.values
+        else:
+            X_test_np = X_test
+        
+        # 应用RobustScaler
+        X_train_scaled = self.scaler.fit_transform(X_train_np)
+        X_test_scaled = self.scaler.transform(X_test_np)
+        
+        return X_train_scaled, X_test_scaled
+    
+    def build_binary_classifier(self):
+        """构建二分类模型（是否下雨）"""
+        if self.clf_model_type == 'lr':
+            # 逻辑回归 - 简单且稳健
+            base_model = LogisticRegression(
+                C=0.05,  # 更强的L2正则化
+                penalty='elasticnet',
+                solver='saga',
+                l1_ratio=0.2,
+                class_weight='balanced', 
+                max_iter=3000,
+                random_state=42
+            )
+            param_grid = {
+                'C': [0.01, 0.05, 0.1, 0.5],
+                'l1_ratio':[0.1, 0.2, 0.3]
+            }
+            
+        elif self.clf_model_type == 'rf':
+            # 随机森林 - 控制复杂度以避免过拟合
+            base_model = RandomForestClassifier(
+                n_estimators=200,
+                max_depth=8,  # 限制树深度
+                min_samples_split=8,
+                min_samples_leaf=4,
+                max_features='sqrt',  # 使用特征子集
+                bootstrap=True,
+                oob_score=True,  # 启用袋外评分
+                class_weight='balanced_subsample',
+                random_state=42
+            )
+            param_grid = {
+                'max_depth': [6, 8, 10],
+                'n_estimators': [150, 200, 250],
+                'min_samples_leaf': [2, 4, 6]
+            }
+            
+        elif self.clf_model_type == 'gbdt':
+            # 梯度提升树 - 通过慢学习率降低过拟合
+            base_model = GradientBoostingClassifier(
+                n_estimators=250,
+                max_depth=4,  # 浅树
+                learning_rate=0.03,  # 慢学习率
+                subsample=0.7,  # 样本子采样
+                min_samples_split=15,
+                min_samples_leaf=10,
+                max_features=0.8,  # 特征子采样
+                random_state=42
+            )
+            param_grid = {
+                'n_estimators': [200, 250, 300],
+                'max_depth': [3, 4, 5],
+                'learning_rate': [0.01, 0.03, 0.05],
+                'subsample': [0.7, 0.8, 0.9]
+            }
+            
+            # 在RainfallTwoStageModel类的build_binary_classifier方法中添加
+        elif self.clf_model_type == 'xgboost':
+            from xgboost import XGBClassifier
+            base_model = XGBClassifier(
+                n_estimators=300,  # 增加树数量
+                max_depth=5,  # 适当增加深度
+                learning_rate=0.03,  # 降低学习率
+                subsample=0.7,  # 减少样本比例
+                colsample_bytree=0.7,  # 减少特征比例
+                min_child_weight=5,  # 增加叶子节点要求
+                gamma=0.2,  # 增加分裂阈值
+                scale_pos_weight=15,  # 提高正样本权重
+                reg_alpha=0.1,  # L1正则化
+                reg_lambda=1.0,  # L2正则化
+                random_state=42,
+                tree_method='hist',  # 更快的直方图算法
+                eval_metric='auc'  # 评估指标
+            )
+            param_grid = {
+                'n_estimators': [250, 300, 350],
+                'max_depth': [4, 5, 6],
+                'min_child_weight': [3, 5, 7],
+                'scale_pos_weight': [10, 15, 20],
+                'subsample': [0.6, 0.7, 0.8]
+            }
+            
+        elif self.clf_model_type == 'ensemble':
+            # 简单投票集成
+            from sklearn.ensemble import VotingClassifier
+            from xgboost import XGBClassifier
+
+            estimators = [
+                ('lr', LogisticRegression(C=0.05, class_weight='balanced', max_iter=3000, solver='saga')),
+                ('rf', RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=4, class_weight='balanced_subsample')),
+                ('gb', GradientBoostingClassifier(n_estimators=200, max_depth=4, learning_rate=0.03, subsample=0.7)),
+                ('xgb', XGBClassifier(n_estimators=250, max_depth=5, learning_rate=0.03, scale_pos_weight=15))
+            ]
+            
+            base_model = VotingClassifier(estimators=estimators, voting='soft')
+            param_grid = {}  # 简化集成模型调优
+            
+        else:
+            raise ValueError(f"不支持的分类器类型: {self.clf_model_type}")
+        
+        return base_model, param_grid
+    
+    def build_regressor(self):
+        """构建回归模型"""
+        if self.reg_model_type == 'rf':
+            # 随机森林回归器
+            base_model = RandomForestRegressor(
+                n_estimators=100,
+                max_depth=5,
+                min_samples_split=5,
+                min_samples_leaf=5,
+                random_state=42
+            )
+            param_grid = {
+                'max_depth': [3, 5, 8],
+                'min_samples_leaf': [3, 5, 10]
+            }
+            
+        elif self.reg_model_type == 'gbdt':
+            # 梯度提升树回归器
+            base_model = GradientBoostingRegressor(
+                n_estimators=100,
+                max_depth=3,
+                learning_rate=0.05,
+                subsample=0.8,
+                loss='squared_error',
+                random_state=42
+            )
+            param_grid = {
+                'max_depth': [2, 3],
+                'learning_rate': [0.01, 0.05, 0.1]
+            }
+            
+        elif self.reg_model_type == 'ridge':
+            # 岭回归 - 简单线性模型
+            base_model = Ridge(
+                alpha=1.0,
+                random_state=42
+            )
+            param_grid = {
+                'alpha': [0.1, 1.0, 10.0]
+            }
+            
+        elif self.reg_model_type == 'xgboost':
+            # XGBoost回归器
+            try:
+                from xgboost import XGBRegressor
+                
+                # 基础模型配置 - 增强了过拟合控制
+                base_model = XGBRegressor(
+                    n_estimators=200,           # 增加树的数量以提高性能
+                    max_depth=4,                # 保持不变
+                    learning_rate=0.05,         # 保持不变
+                    subsample=0.8,              # 保持不变，有助于防止过拟合 
+                    colsample_bytree=0.8,       # 保持不变，有助于防止过拟合
+                    min_child_weight=3,         # 新增：防止在少量样本上过拟合
+                    gamma=0.1,                  # 新增：控制树节点分裂的最小损失减少
+                    reg_alpha=0.1,              # 新增：L1正则化，减少模型复杂性
+                    reg_lambda=1.0,             # 新增：L2正则化，减少模型复杂性
+                    random_state=42,
+                    n_jobs=-1                   # 使用所有CPU核心加速训练
+                )
+                
+                # 增加参数网格搜索范围
+                param_grid = {
+                    'max_depth': [3, 4, 5],
+                    'learning_rate': [0.01, 0.05, 0.1],
+                    'min_child_weight': [1, 3, 5],    # 新增：控制叶节点所需的最小样本权重和
+                    'gamma': [0, 0.1, 0.2],           # 新增：控制分裂所需的最小损失减少
+                    'subsample': [0.7, 0.8, 0.9],     # 新增：每棵树随机采样的比例
+                    'colsample_bytree': [0.7, 0.8, 0.9]  # 新增：每棵树随机采样的特征比例
+                }
+                
+                # 针对不同时间点调整的特定参数
+                for time_point in self.time_points:
+                    if time_point == 1:  # 1小时预测
+                        # 1小时预测结果较好，可使用略微复杂的模型
+                        base_model.max_depth = 5
+                        base_model.learning_rate = 0.1
+                        param_grid = {
+                            'max_depth': [4, 5, 6],
+                            'learning_rate': [0.05, 0.1, 0.15],
+                            'min_child_weight': [1, 3],
+                            'gamma': [0, 0.1]
+                        }
+                    elif time_point == 2:  # 2小时预测
+                        # 2小时预测XGBoost性能略差，增强正则化
+                        base_model.max_depth = 3
+                        base_model.learning_rate = 0.05
+                        base_model.subsample = 0.7
+                        base_model.reg_alpha = 0.5
+                        param_grid = {
+                            'max_depth': [2, 3, 4],
+                            'learning_rate': [0.01, 0.05],
+                            'min_child_weight': [3, 5],
+                            'gamma': [0.1, 0.2]
+                        }
+                    elif time_point >= 3:  # 3小时及以上预测
+                        # 3小时预测需要更强的正则化防止过拟合
+                        base_model.max_depth = 3
+                        base_model.learning_rate = 0.03
+                        base_model.subsample = 0.7
+                        base_model.colsample_bytree = 0.7
+                        base_model.reg_alpha = 1.0
+                        base_model.reg_lambda = 2.0
+                        param_grid = {
+                            'max_depth': [2, 3, 4],
+                            'learning_rate': [0.01, 0.03, 0.05],
+                            'min_child_weight': [3, 5, 7],
+                            'gamma': [0.1, 0.2, 0.3]
+                        }
+                
+                # 如果是对暴雨类别数据的预测，采用特殊设置
+                if hasattr(self, 'current_rain_category') and self.current_rain_category == 'storm':
+                    base_model.max_depth = 2
+                    base_model.learning_rate = 0.01
+                    base_model.min_child_weight = 5
+                    base_model.subsample = 0.6
+                    base_model.colsample_bytree = 0.6
+                    base_model.reg_alpha = 2.0
+                    base_model.reg_lambda = 3.0
+                    param_grid = {
+                        'max_depth': [2, 3],
+                        'learning_rate': [0.005, 0.01],
+                        'min_child_weight': [5, 7]
+                    }
+            except ImportError:
+                print("XGBoost不可用，回退到GBDT")
+                return self.build_regressor()
+            
+        else:
+            raise ValueError(f"不支持的回归器类型: {self.reg_model_type}")
+        
+        return base_model, param_grid
+    
+    def train_binary_classifier(self, X_train, y_train, X_val=None, y_val=None):
+        """训练二分类降雨分类器（判断是否下雨）"""
+        print(f"\n训练降雨二分类器 ({self.clf_model_type})...")
+        
+        # 构建基础模型
+        base_model, param_grid = self.build_binary_classifier()
+        
+        # 如果有参数需要调优且不是集成模型
+        if param_grid and self.clf_model_type != 'ensemble':
+            # 使用GridSearchCV查找最佳参数
+            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+            grid_search = GridSearchCV(
+                estimator=base_model,
+                param_grid=param_grid,
+                cv=cv,
+                scoring='f1',  # 使用F1分数
+                n_jobs=-1
+            )
+            
+            # 训练模型
+            grid_search.fit(X_train, y_train)
+            
+            # 获取最佳模型
+            best_model = grid_search.best_estimator_
+            print(f"最佳分类器参数: {grid_search.best_params_}")
+        else:
+            # 直接使用基础模型
+            best_model = base_model
+            best_model.fit(X_train, y_train)
+        
+        # 应用概率校准 - 使用验证集进行校准
+        if X_val is not None and y_val is not None and hasattr(best_model, "predict_proba"):
+            print("使用验证集进行概率校准...")
+            calibrator = CalibratedClassifierCV(
+                best_model, 
+                method='sigmoid',  # 或 'isotonic'
+                cv='prefit'  # 使用已训练模型
+            )
+            calibrator.fit(X_val, y_val)
+            self.binary_classifier = calibrator
+        else:
+            self.binary_classifier = best_model
+        
+        # 检查特征重要性（如果模型支持）
+        if hasattr(best_model, 'feature_importances_'):
+            self.feature_importances_clf = best_model.feature_importances_
+            
+            # 输出重要特征
+            if self.feature_names is not None:
+                importance_pairs = sorted(zip(self.feature_names, self.feature_importances_clf), 
+                                          key=lambda x: x[1], reverse=True)
+                print("\n分类器特征重要性：")
+                for feature, importance in importance_pairs:
+                    print(f"  {feature}: {importance:.4f}")
+                    
+                # 绘制特征重要性
+                plt.figure(figsize=(12, 8))
+                features = [x[0] for x in importance_pairs]
+                importances = [x[1] for x in importance_pairs]
+                plt.barh(range(len(features)), importances, align='center')
+                plt.yticks(range(len(features)), features)
+                plt.xlabel('Importance')
+                plt.title('Binary Classifier Feature Importance')
+                plt.tight_layout()
+                plt.savefig(f"{self.output_dir}/binary_classifier_feature_importance.png")
+                plt.close()
+                    
+        # 评估训练集性能
+        y_train_pred = self.binary_classifier.predict(X_train)
+        train_accuracy = accuracy_score(y_train, y_train_pred)
+        train_precision = precision_score(y_train, y_train_pred)
+        train_recall = recall_score(y_train, y_train_pred)
+        train_f1 = f1_score(y_train, y_train_pred)
+        
+        print(f"二分类器训练集性能: 准确率={train_accuracy:.4f}, 精确率={train_precision:.4f}, "
+              f"召回率={train_recall:.4f}, F1分数={train_f1:.4f}")
+        
+        return self.binary_classifier
+    
+    def train_category_classifier(self, X_train, y_category, rain_mask):
+        """训练降雨强度类别分类器（小雨、中雨、大雨、暴雨）"""
+        # 仅使用降雨样本进行类别预测
+        X_train_rain = X_train[rain_mask]
+        y_category_rain = y_category[rain_mask]
+        
+        unique_categories = np.unique(y_category_rain)
+        if len(unique_categories) < 2:
+            print(f"警告: 降雨类别样本不足，无法训练分类器 (唯一类别: {unique_categories})")
+            return None
+            
+        print(f"\n训练降雨类别分类器，使用 {len(X_train_rain)} 个降雨样本...")
+        
+        # 计算类别权重 - 反比于类别频率，加强对大雨和暴雨的权重
+        try:
+            # 将类别转换为整数
+            y_category_rain_int = y_category_rain.astype(int)
+            
+            # 计算类别频率
+            class_counts = np.bincount(y_category_rain_int)[1:]  # 跳过0类别
+            total_samples = len(y_category_rain)
+            
+            # 应用不平衡权重 - 大雨和暴雨类别权重更高
+            class_weights = {}
+            for i, count in enumerate(class_counts):
+                category = i + 1  # 类别从1开始
+                # 小雨:1倍，中雨:2倍，大雨:4倍，暴雨:8倍基础权重
+                multiplier = 1 if category == 1 else 2 if category == 2 else 4 if category == 3 else 8
+                if count > 0:
+                    class_weights[category] = (total_samples / (count * len(class_counts))) * multiplier
+                else:
+                    class_weights[category] = 1.0
+                
+            print("类别权重:")
+            for cat, weight in class_weights.items():
+                cat_name = "小雨" if cat == 1 else "中雨" if cat == 2 else "大雨" if cat == 3 else "暴雨"
+                print(f"  {cat_name}: {weight:.2f}")
+        except Exception as e:
+            print(f"计算类别权重时出错: {str(e)}")
+            # 使用默认权重
+            class_weights = {1: 1.0, 2: 2.0, 3: 4.0, 4: 8.0}
+            print("使用默认类别权重")
+        
+        # 对大雨和暴雨样本进行过采样
+        oversampled_X = []
+        oversampled_y = []
+        
+        # 添加所有原始样本
+        oversampled_X.append(X_train_rain)
+        oversampled_y.append(y_category_rain)
+        
+        # 过采样大雨样本
+        try:
+            heavy_mask = (y_category_rain == 3)
+            heavy_count = sum(heavy_mask)
+            if heavy_count > 0 and heavy_count < 100:
+                X_heavy = X_train_rain[heavy_mask]
+                y_heavy = y_category_rain[heavy_mask]
+                
+                # 过采样到100个样本或原始样本的5倍，以较小者为准
+                target_samples = min(100, heavy_count * 5)
+                X_heavy_resampled, y_heavy_resampled = resample(
+                    X_heavy, y_heavy,
+                    replace=True,
+                    n_samples=target_samples,
+                    random_state=42
+                )
+                oversampled_X.append(X_heavy_resampled)
+                oversampled_y.append(y_heavy_resampled)
+                print(f"过采样大雨样本: {heavy_count} -> {target_samples}")
+        except Exception as e:
+            print(f"过采样大雨样本时出错: {str(e)}")
+        
+        # 过采样暴雨样本
+        try:
+            extreme_mask = (y_category_rain == 4)
+            extreme_count = sum(extreme_mask)
+            if extreme_count > 0 and extreme_count < 50:
+                X_extreme = X_train_rain[extreme_mask]
+                y_extreme = y_category_rain[extreme_mask]
+                
+                # 过采样到50个样本或原始样本的10倍，以较小者为准
+                target_samples = min(50, extreme_count * 10)
+                X_extreme_resampled, y_extreme_resampled = resample(
+                    X_extreme, y_extreme,
+                    replace=True,
+                    n_samples=target_samples,
+                    random_state=42
+                )
+                oversampled_X.append(X_extreme_resampled)
+                oversampled_y.append(y_extreme_resampled)
+                print(f"过采样暴雨样本: {extreme_count} -> {target_samples}")
+        except Exception as e:
+            print(f"过采样暴雨样本时出错: {str(e)}")
+        
+        # 合并所有样本
+        try:
+            if len(oversampled_X) > 1:
+                X_train_rain_balanced = np.vstack(oversampled_X)
+                y_category_rain_balanced = np.hstack(oversampled_y)
+            else:
+                X_train_rain_balanced = X_train_rain
+                y_category_rain_balanced = y_category_rain
+                
+            print(f"平衡后的训练集大小: {len(X_train_rain_balanced)}")
+        except Exception as e:
+            print(f"合并过采样数据时出错: {str(e)}")
+            # 使用原始数据
+            X_train_rain_balanced = X_train_rain
+            y_category_rain_balanced = y_category_rain
+            print("使用原始未平衡数据")
+        
+        # 使用随机森林分类器进行类别预测，增加树的深度和数量
+        classifier = RandomForestClassifier(
+            n_estimators=200,  # 增加树的数量
+            max_depth=10,      # 增加深度
+            min_samples_leaf=2,
+            class_weight=class_weights,  # 使用类别权重
+            random_state=42
+        )
+        
+        # 训练分类器
+        classifier.fit(X_train_rain_balanced, y_category_rain_balanced)
+        
+        # 检查性能
+        y_pred = classifier.predict(X_train_rain)
+        accuracy = accuracy_score(y_category_rain, y_pred)
+        
+        print(f"降雨类别分类器训练准确率: {accuracy:.4f}")
+        
+        # 检查类别分布
+        try:
+            class_counts = np.bincount(y_category_rain.astype(int))
+            class_predictions = np.bincount(y_pred.astype(int))
+            
+            # 确保结果具有相同的大小
+            if len(class_counts) > len(class_predictions):
+                class_predictions = np.pad(class_predictions, (0, len(class_counts) - len(class_predictions)))
+            elif len(class_predictions) > len(class_counts):
+                class_counts = np.pad(class_counts, (0, len(class_predictions) - len(class_counts)))
+            
+            print("降雨类别分布：")
+            categories = ["无雨", "小雨", "中雨", "大雨", "暴雨"]
+            for i in range(1, min(len(class_counts), len(categories))):  # 跳过"无雨"(0)
+                print(f"  {categories[i]}: {class_counts[i]} 实际, {class_predictions[i]} 预测")
+        except Exception as e:
+            print(f"统计类别分布时出错: {str(e)}")
+        
+        # 混淆矩阵
+        try:
+            confusion = confusion_matrix(y_category_rain, y_pred)
+            print("\n类别分类器混淆矩阵:")
+            
+            # 显示混淆矩阵，标签为雨量类别
+            unique_cats = sorted(np.unique(np.concatenate([y_category_rain, y_pred])))
+            label_names = ["小雨" if cat == 1 else "中雨" if cat == 2 else "大雨" if cat == 3 else "暴雨" for cat in unique_cats]
+            
+            print("预测 →")
+            print("实际 ↓")
+            print("      " + " ".join(f"{name:8s}" for name in label_names))
+            for i, name in enumerate(label_names):
+                row_values = " ".join(f"{confusion[i, j]:8d}" for j in range(len(label_names)))
+                print(f"{name:6s} {row_values}")
+        except Exception as e:
+            print(f"计算混淆矩阵时出错: {str(e)}")
+        
+        # 保存分类器
+        self.rain_category_classifier = classifier
+        
+        return classifier
+    
+    def train_stratified_regressors(self, X_train, y_train, y_category, rain_mask):
+        """训练针对不同降雨强度的分层回归模型"""
+        print("\n训练分层降雨回归器...")
+        
+        # 获取降雨样本
+        X_train_rain = X_train[rain_mask]
+        y_train_rain = y_train[rain_mask]
+        y_category_rain = y_category[rain_mask]
+        
+        # 应用降雨量的对数变换
+        if self.log_transform:
+            print("对降雨量应用对数变换 (log(rain+1))...")
+            y_train_rain_transformed = np.log1p(y_train_rain)
+        else:
+            y_train_rain_transformed = y_train_rain
+        
+        # 检查降雨类别的分布
+        categories = sorted(np.unique(y_category_rain))
+        for cat in categories:
+            if cat == 0:  # 跳过无雨类别
+                continue
+                
+            cat_mask = (y_category_rain == cat)
+            count = sum(cat_mask)
+            
+            cat_name = "小雨" if cat == 1 else "中雨" if cat == 2 else "大雨" if cat == 3 else "暴雨"
+            print(f"{cat_name}样本: {count}")
+            
+            if count < 10:
+                print(f"  警告: {cat_name}样本太少，无法训练可靠模型")
+        
+        # 为每个降雨类别构建并训练模型
+        base_model, _ = self.build_regressor()
+        self.regressors = {}
+        
+        for cat in categories:
+            if cat == 0:  # 跳过无雨类别
+                continue
+                
+            cat_mask = (y_category_rain == cat)
+            count = sum(cat_mask)
+            
+            if count < 5:
+                print(f"跳过样本数量为{count}的类别{cat}，样本太少")
+                continue  # 跳过样本太少的类别
+                
+            cat_name = "light" if cat == 1 else "medium" if cat == 2 else "heavy" if cat == 3 else "extreme"
+            reg_name = "light" if cat == 1 else "medium" if cat == 2 else "heavy" if cat == 3 else "extreme"
+            
+            # 对大雨和暴雨样本进行过采样
+            try:
+                if cat >= 3 and count < 100:  # 大雨和暴雨样本
+                    # 简单过采样
+                    X_rain_cat = X_train_rain[cat_mask]
+                    y_rain_cat = y_train_rain_transformed[cat_mask]
+                    
+                    # 过采样量取决于类别：大雨5倍，暴雨10倍
+                    multiplier = 5 if cat == 3 else 10
+                    target_samples = min(100, count * multiplier)
+                    X_resampled, y_resampled = resample(
+                        X_rain_cat, y_rain_cat,
+                        replace=True,
+                        n_samples=target_samples,
+                        random_state=42
+                    )
+                    print(f"\n训练{cat_name}回归器，使用 {count} 个原始样本, 过采样至 {target_samples} 个样本...")
+                    
+                    # 克隆基础模型并训练
+                    regressor = clone(base_model)
+                    regressor.fit(X_resampled, y_resampled)
+                else:
+                    print(f"\n训练{cat_name}回归器，使用 {count} 个样本...")
+                    
+                    # 克隆基础模型并训练
+                    regressor = clone(base_model)
+                    regressor.fit(X_train_rain[cat_mask], y_train_rain_transformed[cat_mask])
+            except Exception as e:
+                print(f"训练{cat_name}回归器时出错: {str(e)}")
+                # 尝试使用基本训练方法（无过采样）
+                try:
+                    print(f"尝试使用基本训练方法...")
+                    regressor = clone(base_model)
+                    regressor.fit(X_train_rain[cat_mask], y_train_rain_transformed[cat_mask])
+                except Exception as e2:
+                    print(f"基本训练方法也失败: {str(e2)}")
+                    continue  # 跳过此类别
+            
+            # 存储训练好的模型
+            self.regressors[reg_name] = regressor
+            
+            # 在训练数据上评估
+            try:
+                y_pred_transformed = regressor.predict(X_train_rain[cat_mask])
+                
+                if self.log_transform:
+                    y_pred = np.expm1(y_pred_transformed)
+                else:
+                    y_pred = y_pred_transformed
+                    
+                # 防止负值
+                y_pred = np.maximum(y_pred, 0)
+                
+                # 计算评估指标
+                rmse = np.sqrt(mean_squared_error(y_train_rain[cat_mask], y_pred))
+                mae = mean_absolute_error(y_train_rain[cat_mask], y_pred)
+                r2 = r2_score(y_train_rain[cat_mask], y_pred)
+                
+                print(f"{cat_name}回归器训练指标: RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")
+                
+                # 如果模型有特征重要性，保存它们
+                if hasattr(regressor, 'feature_importances_') and self.feature_names is not None:
+                    importance_pairs = sorted(zip(self.feature_names, regressor.feature_importances_), 
+                                            key=lambda x: x[1], reverse=True)
+                    print(f"\n{cat_name}回归器特征重要性（前10）：")
+                    for feature, importance in importance_pairs[:10]:
+                        print(f"  {feature}: {importance:.4f}")
+                    
+                    # 绘制特征重要性
+                    plt.figure(figsize=(12, 8))
+                    features = [x[0] for x in importance_pairs[:15]]
+                    importances = [x[1] for x in importance_pairs[:15]]
+                    plt.barh(range(len(features)), importances, align='center')
+                    plt.yticks(range(len(features)), features)
+                    plt.xlabel('Importance')
+                    plt.title(f'{cat_name} Rain Regressor Feature Importance')
+                    plt.tight_layout()
+                    plt.savefig(f"{self.output_dir}/{reg_name}_regressor_importance.png")
+                    plt.close()
+                    
+                    # 绘制实际值与预测值
+                    plt.figure(figsize=(10, 6))
+                    plt.scatter(y_train_rain[cat_mask], y_pred, alpha=0.6)
+                    max_val = max(max(y_train_rain[cat_mask]), max(y_pred))
+                    plt.plot([0, max_val], [0, max_val], 'r--')
+                    plt.xlabel('Actual Rainfall (mm)')
+                    plt.ylabel('Predicted Rainfall (mm)')
+                    plt.title(f'{cat_name} Rain - Actual vs Predicted (Training)')
+                    plt.grid(True)
+                    plt.tight_layout()
+                    plt.savefig(f"{self.output_dir}/{reg_name}_train_performance.png")
+                    plt.close()
+            except Exception as e:
+                print(f"评估{cat_name}回归器时出错: {str(e)}")
+        
+        return self.regressors
+    
+    def train_regressor(self, X_train, y_train, rain_mask):
+        """训练降雨量回归器，仅使用降雨样本"""
+        # 获取降雨样本
+        X_train_rain = X_train[rain_mask]
+        y_train_rain = y_train[rain_mask]
+        
+        if len(X_train_rain) < 10:
+            print("警告: 降雨样本太少，无法训练回归模型")
+            return None
+        
+        print(f"\n训练降雨量回归器 ({self.reg_model_type})，使用 {len(X_train_rain)} 个降雨样本...")
+        
+        # 考虑降雨量的对数变换
+        log_transform = True  # 默认使用对数变换
+        self.log_transform = log_transform
+        
+        if log_transform:
+            print("对降雨量应用对数变换 (log(rain+1))...")
+            y_train_rain_transformed = np.log1p(y_train_rain)
+        else:
+            y_train_rain_transformed = y_train_rain
+        
+        # 构建回归模型
+        base_model, param_grid = self.build_regressor()
+        
+        # 如果样本足够多，调整参数
+        try:
+            if len(X_train_rain) >= 50 and param_grid:
+                from sklearn.model_selection import KFold
+                cv = KFold(n_splits=5, shuffle=True, random_state=42)
+                grid_search = GridSearchCV(
+                    estimator=base_model,
+                    param_grid=param_grid,
+                    cv=cv,
+                    scoring='neg_mean_squared_error',
+                    n_jobs=-1
+                )
+                
+                # 训练模型
+                grid_search.fit(X_train_rain, y_train_rain_transformed)
+                
+                # 获取最佳模型
+                best_model = grid_search.best_estimator_
+                print(f"最佳回归器参数: {grid_search.best_params_}")
+            else:
+                # 直接使用基础模型
+                best_model = base_model
+                best_model.fit(X_train_rain, y_train_rain_transformed)
+            
+            # 保存模型
+            self.regressor = best_model
+        except Exception as e:
+            print(f"训练回归器时出错: {str(e)}")
+            print(f"尝试使用简化的回归器...")
+            # 使用简化的回归器
+            try:
+                simple_model = RandomForestRegressor(n_estimators=50, max_depth=3, random_state=42)
+                simple_model.fit(X_train_rain, y_train_rain_transformed)
+                self.regressor = simple_model
+            except Exception as e2:
+                print(f"简化回归器训练也失败: {str(e2)}")
+                return None
+        
+        # 检查特征重要性（如果模型支持）
+        if hasattr(self.regressor, 'feature_importances_'):
+            self.feature_importances_reg = self.regressor.feature_importances_
+            
+            # 输出重要特征
+            if self.feature_names is not None:
+                importance_pairs = sorted(zip(self.feature_names, self.feature_importances_reg), 
+                                          key=lambda x: x[1], reverse=True)
+                print("\n回归器特征重要性：")
+                for feature, importance in importance_pairs:
+                    print(f"  {feature}: {importance:.4f}")
+                
+                # 绘制特征重要性
+                plt.figure(figsize=(12, 8))
+                features = [x[0] for x in importance_pairs]
+                importances = [x[1] for x in importance_pairs]
+                plt.barh(range(len(features)), importances, align='center')
+                plt.yticks(range(len(features)), features)
+                plt.xlabel('Importance')
+                plt.title('Regressor Feature Importance')
+                plt.tight_layout()
+                plt.savefig(f"{self.output_dir}/regressor_feature_importance.png")
+                plt.close()
+        
+        # 评估训练集性能
+        try:
+            if log_transform:
+                y_train_rain_pred_transformed = self.regressor.predict(X_train_rain)
+                y_train_rain_pred = np.expm1(y_train_rain_pred_transformed)
+            else:
+                y_train_rain_pred = self.regressor.predict(X_train_rain)
+            
+            # 计算评估指标
+            train_rmse = np.sqrt(mean_squared_error(y_train_rain, y_train_rain_pred))
+            train_mae = mean_absolute_error(y_train_rain, y_train_rain_pred)
+            train_r2 = r2_score(y_train_rain, y_train_rain_pred)
+            
+            print(f"回归器训练集性能: RMSE={train_rmse:.4f}, MAE={train_mae:.4f}, R²={train_r2:.4f}")
+            
+            # 绘制训练集的实际值与预测值对比图
+            plt.figure(figsize=(10, 6))
+            plt.scatter(y_train_rain, y_train_rain_pred, alpha=0.5)
+            max_val = max(max(y_train_rain), max(y_train_rain_pred))
+            plt.plot([0, max_val], [0, max_val], 'r--')
+            plt.xlabel('Actual Rainfall (mm)')
+            plt.ylabel('Predicted Rainfall (mm)')
+            plt.title('Actual vs Predicted Rainfall - Training Set')
+            plt.grid(True)
+            plt.savefig(f"{self.output_dir}/regressor_train_performance.png")
+            plt.close()
+        except Exception as e:
+            print(f"评估回归器性能时出错: {str(e)}")
+        
+        return self.regressor
+    
+    def optimize_threshold(self, X_test, y_test):
+        """优化分类阈值"""
+        print("\n优化分类阈值...")
+        
+        # 获取预测概率
+        y_pred_proba = self.binary_classifier.predict_proba(X_test)[:, 1]
+        
+        # 分析预测概率分布
+        print(f"预测概率分布: 最小值={y_pred_proba.min():.4f}, 最大值={y_pred_proba.max():.4f}, 平均值={y_pred_proba.mean():.4f}")
+        
+        # 计算有雨和无雨样本的平均预测概率
+        rain_probs = y_pred_proba[y_test == 1]
+        no_rain_probs = y_pred_proba[y_test == 0]
+        
+        if len(rain_probs) > 0:
+            print(f"有雨样本的平均预测概率: {rain_probs.mean():.4f}")
+        if len(no_rain_probs) > 0:
+            print(f"无雨样本的平均预测概率: {no_rain_probs.mean():.4f}")
+        
+        # 绘制概率分布直方图
+        plt.figure(figsize=(10, 6))
+        if len(no_rain_probs) > 0:
+            plt.hist(no_rain_probs, bins=20, alpha=0.5, label='No Rain Samples', color='blue')
+        if len(rain_probs) > 0:
+            plt.hist(rain_probs, bins=20, alpha=0.5, label='Rain Samples', color='red')
+        plt.xlabel('Prediction Probability')
+        plt.ylabel('Number of Samples')
+        plt.title('Prediction Probability Distribution')
+        plt.legend()
+        plt.grid(True, alpha=0.3)
+        plt.savefig(f"{self.output_dir}/prediction_probability_distribution.png")
+        plt.close()
+        
+        # 绘制ROC曲线
+        fpr, tpr, thresholds_roc = roc_curve(y_test, y_pred_proba)
+        roc_auc = auc(fpr, tpr)
+        
+        plt.figure(figsize=(10, 6))
+        plt.plot(fpr, tpr, color='darkorange', lw=2, 
+                 label=f'ROC Curve (AUC = {roc_auc:.4f})')
+        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
+        plt.xlim([0.0, 1.0])
+        plt.ylim([0.0, 1.05])
+        plt.xlabel('False Positive Rate')
+        plt.ylabel('True Positive Rate')
+        plt.title('Receiver Operating Characteristic Curve')
+        plt.legend(loc="lower right")
+        plt.grid(True, alpha=0.3)
+        plt.savefig(f"{self.output_dir}/roc_curve.png")
+        plt.close()
+        
+        # 计算不同阈值的F1分数
+        thresholds = np.linspace(0.05, 0.95, 19)
+        f1_scores = []
+        precision_scores = []
+        recall_scores = []
+        
+        for threshold in thresholds:
+            y_pred = (y_pred_proba >= threshold).astype(int)
+            f1 = f1_score(y_test, y_pred)
+            precision = precision_score(y_test, y_pred)
+            recall = recall_score(y_test, y_pred)
+            
+            f1_scores.append(f1)
+            precision_scores.append(precision)
+            recall_scores.append(recall)
+            
+            print(f"阈值={threshold:.2f}: 精确率={precision:.4f}, 召回率={recall:.4f}, F1分数={f1:.4f}")
+        
+        # 绘制不同阈值的性能曲线
+        plt.figure(figsize=(10, 6))
+        plt.plot(thresholds, precision_scores, 'b-', marker='o', label='Precision')
+        plt.plot(thresholds, recall_scores, 'g-', marker='s', label='Recall')
+        plt.plot(thresholds, f1_scores, 'r-', marker='^', label='F1 Score')
+        plt.xlabel('Threshold')
+        plt.ylabel('Score')
+        plt.title('Performance at Different Thresholds')
+        plt.legend()
+        plt.grid(True)
+        plt.savefig(f"{self.output_dir}/threshold_performance.png")
+        plt.close()
+        
+        # 找到F1分数最高的阈值
+        best_idx = np.argmax(f1_scores)
+        self.threshold = thresholds[best_idx]
+        
+        print(f"最佳阈值: {self.threshold:.4f} (F1分数: {f1_scores[best_idx]:.4f})")
+        
+        return self.threshold
+    
+    def evaluate_binary_classifier(self, X_test, y_test):
+        """评估二分类器性能"""
+        print("\n评估二分类器性能...")
+        
+        # 预测测试集
+        y_pred_proba = self.binary_classifier.predict_proba(X_test)[:, 1]
+        y_pred = (y_pred_proba >= self.threshold).astype(int)
+        
+        # 基本分类指标
+        accuracy = accuracy_score(y_test, y_pred)
+        precision = precision_score(y_test, y_pred)
+        recall = recall_score(y_test, y_pred)
+        f1 = f1_score(y_test, y_pred)
+        
+        print("二分类性能指标：")
+        print(f"准确率: {accuracy:.4f}")
+        print(f"精确率: {precision:.4f}")
+        print(f"召回率: {recall:.4f}")
+        print(f"F1分数: {f1:.4f}")
+        
+        # 混淆矩阵
+        cm = confusion_matrix(y_test, y_pred)
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+        plt.title('Binary Classifier Confusion Matrix')
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.savefig(f"{self.output_dir}/binary_confusion_matrix.png")
+        plt.close()
+        
+        # 预测统计
+        true_rain_count = np.sum(y_test == 1)
+        pred_rain_count = np.sum(y_pred == 1)
+        correct_rain_count = np.sum((y_test == 1) & (y_pred == 1))
+        
+        print("\n预测统计：")
+        print(f"总样本数: {len(y_test)}")
+        print(f"实际有雨样本: {true_rain_count} ({true_rain_count/len(y_test)*100:.2f}%)")
+        print(f"预测有雨样本: {pred_rain_count} ({pred_rain_count/len(y_test)*100:.2f}%)")
+        
+        if true_rain_count > 0:
+            print(f"正确预测有雨: {correct_rain_count} ({correct_rain_count/true_rain_count*100:.2f}% 实际有雨样本)")
+        
+        return {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1': f1,
+            'confusion_matrix': cm,
+            'predictions': y_pred,
+            'probabilities': y_pred_proba
+        }
+    
+    def evaluate_category_classifier(self, X_test, y_category_test, mask_true, mask_pred):
+        """评估降雨类别分类器性能"""
+        if self.rain_category_classifier is None:
+            print("类别分类器未训练，无法评估")
+            return None
+            
+        # 只评估正确识别为降雨的样本
+        mask_both = mask_true & mask_pred
+        
+        if sum(mask_both) == 0:
+            print("没有正确分类的降雨样本，无法评估类别分类器")
+            return None
+            
+        X_test_rain = X_test[mask_both]
+        y_category_test_rain = y_category_test[mask_both]
+        
+        # 进行预测
+        y_category_pred = self.rain_category_classifier.predict(X_test_rain)
+        
+        # 计算准确率
+        accuracy = accuracy_score(y_category_test_rain, y_category_pred)
+        
+        print("\n降雨类别分类器评估：")
+        print(f"准确率: {accuracy:.4f}")
+        
+        # 混淆矩阵
+        try:
+            categories = sorted(np.unique(np.concatenate([y_category_test_rain, y_category_pred])))
+            cm = confusion_matrix(y_category_test_rain, y_category_pred, labels=categories)
+            
+            # 打印混淆矩阵
+            category_names = ["小雨", "中雨", "大雨", "暴雨"]
+            print("\n混淆矩阵：")
+            print("预测 →")
+            print("实际 ↓")
+            
+            # 创建表头
+            header = "      " + " ".join(f"{category_names[cat-1]:8s}" for cat in categories if cat > 0 and cat-1 < len(category_names))
+            print(header)
+            print("-" * len(header))
+            
+            for i, cat in enumerate(categories):
+                if cat > 0 and cat-1 < len(category_names):
+                    row = f"{category_names[cat-1]:6s} "
+                    row += " ".join(f"{cm[i, j]:8d}" for j in range(len(categories)))
+                    print(row)
+            
+            # 绘制混淆矩阵热图
+            plt.figure(figsize=(10, 8))
+            category_labels = [category_names[cat-1] if cat > 0 and cat-1 < len(category_names) else f"类别{cat}" 
+                              for cat in categories]
+            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
+                       xticklabels=category_labels,
+                       yticklabels=category_labels)
+            plt.title('Rain Category Classifier Confusion Matrix')
+            plt.ylabel('True Category')
+            plt.xlabel('Predicted Category')
+            plt.tight_layout()
+            plt.savefig(f"{self.output_dir}/category_confusion_matrix.png")
+            plt.close()
+        except Exception as e:
+            print(f"绘制类别混淆矩阵时出错: {str(e)}")
+        
+        return {
+            'accuracy': accuracy,
+            'confusion_matrix': cm if 'cm' in locals() else None,
+            'predictions': y_category_pred
+        }
+    
+    def evaluate_stratified_regressors(self, X_test, y_test, y_category_test, mask_true, mask_pred):
+        """评估分层回归模型的性能"""
+        if not self.regressors:
+            print("分层回归器未训练，无法评估")
+            return None
+            
+        print("\n评估分层回归器性能...")
+        
+        # 只评估正确识别为降雨的样本
+        mask_both = mask_true & mask_pred
+        
+        if sum(mask_both) == 0:
+            print("没有正确分类的降雨样本，无法评估回归器")
+            return None
+            
+        X_test_rain = X_test[mask_both]
+        y_test_rain = y_test[mask_both]
+        y_category_test_rain = y_category_test[mask_both]
+        
+        # 准备存储预测结果
+        y_pred_rain = np.zeros_like(y_test_rain)
+        
+        # 对于每个降雨样本，根据实际类别使用适当的回归器
+        try:
+            for i, (x, category) in enumerate(zip(X_test_rain, y_category_test_rain)):
+                reg_name = "light" if category == 1 else "medium" if category == 2 else "heavy" if category == 3 else "extreme"
+                
+                # 检查是否有该类别的回归器
+                if reg_name in self.regressors:
+                    regressor = self.regressors[reg_name]
+                    
+                    # 进行预测
+                    x_reshaped = x.reshape(1, -1)
+                    if self.log_transform:
+                        pred_log = regressor.predict(x_reshaped)[0]
+                        pred = np.expm1(pred_log)
+                    else:
+                        pred = regressor.predict(x_reshaped)[0]
+                    
+                    # 确保预测值非负
+                    pred = max(0, pred)
+                    
+                    # 对大雨和暴雨应用安全系数
+                    if category == 3:  # 大雨
+                        pred = pred * self.heavy_rain_factor
+                    elif category == 4:  # 暴雨
+                        pred = pred * self.extreme_rain_factor
+                    
+                    y_pred_rain[i] = pred
+                else:
+                    # 该类别没有模型，使用默认模型（如果有）
+                    if self.regressor is not None:
+                        x_reshaped = x.reshape(1, -1)
+                        if self.log_transform:
+                            pred_log = self.regressor.predict(x_reshaped)[0]
+                            pred = np.expm1(pred_log)
+                        else:
+                            pred = self.regressor.predict(x_reshaped)[0]
+                        
+                        # 如果是大雨或暴雨类别但没有专门的回归器，仍应用安全系数
+                        if category == 3:  # 大雨
+                            pred = pred * self.heavy_rain_factor
+                        elif category == 4:  # 暴雨
+                            pred = pred * self.extreme_rain_factor
+                        
+                        y_pred_rain[i] = max(0, pred)
+        except Exception as e:
+            print(f"分层回归预测时出错: {str(e)}")
+            print("尝试使用简化的预测方法...")
+            
+            # 使用单一回归器进行所有预测
+            if self.regressor is not None:
+                try:
+                    if self.log_transform:
+                        y_pred_transformed = self.regressor.predict(X_test_rain)
+                        y_pred_rain = np.expm1(y_pred_transformed)
+                    else:
+                        y_pred_rain = self.regressor.predict(X_test_rain)
+                    y_pred_rain = np.maximum(y_pred_rain, 0)
+                except Exception as e2:
+                    print(f"单一回归器预测也失败: {str(e2)}")
+                    return None
+            else:
+                return None
+        
+        # 计算总体指标
+        try:
+            rmse = np.sqrt(mean_squared_error(y_test_rain, y_pred_rain))
+            mae = mean_absolute_error(y_test_rain, y_pred_rain)
+            r2 = r2_score(y_test_rain, y_pred_rain)
+            
+            print("\n分层回归器总体性能：")
+            print(f"样本数: {len(y_test_rain)}")
+            print(f"RMSE: {rmse:.4f}")
+            print(f"MAE: {mae:.4f}")
+            print(f"R²: {r2:.4f}")
+        except Exception as e:
+            print(f"计算分层回归器总体性能时出错: {str(e)}")
+            rmse, mae, r2 = float('nan'), float('nan'), float('nan')
+        
+        # 计算各类别的指标
+        try:
+            for category in sorted(np.unique(y_category_test_rain)):
+                cat_mask = (y_category_test_rain == category)
+                if sum(cat_mask) < 5:
+                    continue  # 跳过样本过少的类别
+                    
+                cat_name = "小雨" if category == 1 else "中雨" if category == 2 else "大雨" if category == 3 else "暴雨"
+                
+                cat_rmse = np.sqrt(mean_squared_error(y_test_rain[cat_mask], y_pred_rain[cat_mask]))
+                cat_mae = mean_absolute_error(y_test_rain[cat_mask], y_pred_rain[cat_mask])
+                cat_r2 = r2_score(y_test_rain[cat_mask], y_pred_rain[cat_mask])
+                
+                print(f"\n{cat_name}性能：")
+                print(f"样本数: {sum(cat_mask)}")
+                print(f"RMSE: {cat_rmse:.4f}")
+                print(f"MAE: {cat_mae:.4f}")
+                print(f"R²: {cat_r2:.4f}")
+                
+                # 打印一些示例预测
+                if sum(cat_mask) >= 3:
+                    sample_indices = np.where(cat_mask)[0][:3]  # 最多3个示例
+                    print(f"\n{cat_name}示例预测：")
+                    for idx in sample_indices:
+                        # 使用索引从数组或Series获取值
+                        if isinstance(y_test_rain, pd.Series):
+                            actual = y_test_rain.iloc[idx]
+                        else:
+                            actual = y_test_rain[idx]
+                        
+                        pred = y_pred_rain[idx]
+                        error_pct = ((pred-actual)/actual*100) if actual > 0 else 0
+                        print(f"  实际: {actual:.2f}mm, 预测: {pred:.2f}mm, 误差: {pred-actual:.2f}mm ({error_pct:.1f}%)")
+        except Exception as e:
+            print(f"计算分层回归器分类性能时出错: {str(e)}")
+        
+        # 绘制实际值与预测值
+        try:
+            plt.figure(figsize=(10, 6))
+            
+            # 按类别着色点
+            colors = ['blue', 'green', 'red', 'purple']
+            markers = ['o', 's', '^', 'd']
+            categories = sorted(np.unique(y_category_test_rain))
+            
+            for i, category in enumerate(categories):
+                if category > 0 and i < len(colors):
+                    cat_mask = (y_category_test_rain == category)
+                    cat_name = "Light" if category == 1 else "Medium" if category == 2 else "Heavy" if category == 3 else "Extreme"
+                    plt.scatter(y_test_rain[cat_mask], y_pred_rain[cat_mask], 
+                               color=colors[i], marker=markers[i], alpha=0.7, label=f"{cat_name} Rain")
+            
+            # 添加对角线
+            max_val = max(max(y_test_rain), max(y_pred_rain))
+            plt.plot([0, max_val], [0, max_val], 'k--')
+            
+            plt.xlabel('Actual Rainfall (mm)')
+            plt.ylabel('Predicted Rainfall (mm)')
+            plt.title('Actual vs Predicted Values (Stratified Regressors)')
+            plt.legend()
+            plt.grid(True)
+            plt.tight_layout()
+            plt.savefig(f"{self.output_dir}/stratified_regressors_performance.png")
+            plt.close()
+        except Exception as e:
+            print(f"绘制分层回归性能图时出错: {str(e)}")
+        
+        return {
+            'rmse': rmse,
+            'mae': mae,
+            'r2': r2,
+            'predictions': y_pred_rain
+        }
+    
+    def evaluate_regressor(self, X_test, y_test, mask_true, mask_pred):
+        """评估回归器性能"""
+        if self.regressor is None:
+            print("回归器未训练，无法评估")
+            return None
+            
+        print("\n评估单一回归器性能...")
+        
+        # 所有实际降雨样本的性能
+        X_test_true_rain = X_test[mask_true]
+        y_test_true_rain = y_test[mask_true]
+        
+        if len(X_test_true_rain) > 0:
+            try:
+                # 降雨预测
+                if hasattr(self, 'log_transform') and self.log_transform:
+                    y_pred_transformed = self.regressor.predict(X_test_true_rain)
+                    y_pred_true_rain = np.expm1(y_pred_transformed)
+                else:
+                    y_pred_true_rain = self.regressor.predict(X_test_true_rain)
+                
+                # 防止负值
+                y_pred_true_rain = np.maximum(y_pred_true_rain, 0)
+                
+                # 计算评估指标
+                rmse = np.sqrt(mean_squared_error(y_test_true_rain, y_pred_true_rain))
+                mae = mean_absolute_error(y_test_true_rain, y_pred_true_rain)
+                r2 = r2_score(y_test_true_rain, y_pred_true_rain)
+                
+                print("\n所有实际降雨样本的性能：")
+                print(f"样本数: {len(X_test_true_rain)}")
+                print(f"RMSE: {rmse:.4f}")
+                print(f"MAE: {mae:.4f}")
+                print(f"R²: {r2:.4f}")
+                
+                # 绘制实际值与预测值散点图
+                plt.figure(figsize=(10, 6))
+                plt.scatter(y_test_true_rain, y_pred_true_rain, alpha=0.5)
+                
+                # 添加对角线
+                max_val = max(max(y_test_true_rain), max(y_pred_true_rain))
+                plt.plot([0, max_val], [0, max_val], 'r--')
+                
+                plt.xlabel('Actual Rainfall (mm)')
+                plt.ylabel('Predicted Rainfall (mm)')
+                plt.title('Actual vs Predicted Values (All Actual Rain Samples)')
+                plt.grid(True)
+                plt.tight_layout()
+                plt.savefig(f"{self.output_dir}/actual_vs_predicted_all_rain.png")
+                plt.close()
+            except Exception as e:
+                print(f"评估所有实际降雨样本时出错: {str(e)}")
+        
+        # 评估分类器和回归器都正确预测的样本
+        mask_both_correct = mask_true & mask_pred
+        X_test_both = X_test[mask_both_correct]
+        y_test_both = y_test[mask_both_correct]
+        
+        if len(X_test_both) > 0:
+            try:
+                # 降雨预测
+                if hasattr(self, 'log_transform') and self.log_transform:
+                    y_pred_transformed = self.regressor.predict(X_test_both)
+                    y_pred_both = np.expm1(y_pred_transformed)
+                else:
+                    y_pred_both = self.regressor.predict(X_test_both)
+                
+                # 防止负值
+                y_pred_both = np.maximum(y_pred_both, 0)
+                
+                # 计算评估指标
+                rmse = np.sqrt(mean_squared_error(y_test_both, y_pred_both))
+                mae = mean_absolute_error(y_test_both, y_pred_both)
+                r2 = r2_score(y_test_both, y_pred_both)
+                
+                print("\n正确分类的降雨样本性能：")
+                print(f"样本数: {len(X_test_both)}")
+                print(f"RMSE: {rmse:.4f}")
+                print(f"MAE: {mae:.4f}")
+                print(f"R²: {r2:.4f}")
+                
+                # 绘制实际值与预测值散点图
+                plt.figure(figsize=(10, 6))
+                plt.scatter(y_test_both, y_pred_both, alpha=0.5)
+                
+                # 添加对角线
+                max_val = max(max(y_test_both), max(y_pred_both))
+                plt.plot([0, max_val], [0, max_val], 'r--')
+                
+                plt.xlabel('Actual Rainfall (mm)')
+                plt.ylabel('Predicted Rainfall (mm)')
+                plt.title('Actual vs Predicted Values (Correctly Classified Samples)')
+                plt.grid(True)
+                plt.tight_layout()
+                plt.savefig(f"{self.output_dir}/actual_vs_predicted_correct_classified.png")
+                plt.close()
+                
+                return {
+                    'rmse': rmse,
+                    'mae': mae,
+                    'r2': r2,
+                    'predictions': y_pred_both
+                }
+            except Exception as e:
+                print(f"评估正确分类的降雨样本时出错: {str(e)}")
+                return None
+        else:
+            print("没有正确分类的降雨样本，无法评估回归性能")
+            return None
+    
+    def train_and_evaluate(self, data, n_features=30, time_based_split=True):
+        """完整的训练和评估过程"""
+        try:
+            # 步骤1: 数据预处理
+            processed_data = self.preprocess_data(data)
+            
+            # 步骤2: 特征选择
+            X, y_binary, y_amount, y_category = self.select_features(processed_data, n_features=n_features)
+            
+            # 步骤3: 数据分割
+            X_train, X_test, y_binary_train, y_binary_test, y_amount_train, y_amount_test, y_category_train, y_category_test = \
+                self.split_data(X, y_binary, y_amount, y_category, time_based=time_based_split)
+            
+            # 步骤4: 特征标准化
+            X_train_scaled, X_test_scaled = self.scale_features(X_train, X_test)
+            
+            # 步骤5: 训练二分类器（是否下雨）
+            # 使用测试集的前一半进行概率校准
+            val_size = len(X_test_scaled) // 2
+            X_val, y_val = X_test_scaled[:val_size], y_binary_test[:val_size]
+            X_test_final, y_test_final = X_test_scaled[val_size:], y_binary_test[val_size:]
+            y_amount_test_final = y_amount_test[val_size:]
+            y_category_test_final = y_category_test[val_size:] if y_category_test is not None else None
+            
+            self.train_binary_classifier(X_train_scaled, y_binary_train, X_val, y_val)
+            
+            # 步骤6: 优化分类阈值
+            self.optimize_threshold(X_test_final, y_test_final)
+            
+            # 步骤7: 评估二分类器性能
+            clf_results = self.evaluate_binary_classifier(X_test_final, y_test_final)
+            
+            # 步骤8: 训练降雨类别分类器
+            rain_mask_train = y_binary_train == 1
+            if self.stratified_regression and y_category_train is not None:
+                self.train_category_classifier(X_train_scaled, y_category_train, rain_mask_train)
+            
+            # 步骤9: 训练针对不同降雨强度的分层回归器
+            if self.stratified_regression and y_category_train is not None:
+                self.train_stratified_regressors(X_train_scaled, y_amount_train, y_category_train, rain_mask_train)
+            
+            # 步骤10: 同时训练单一回归器作为备选
+            self.train_regressor(X_train_scaled, y_amount_train, rain_mask_train)
+            
+            # 步骤11: 评估回归性能
+            mask_true = y_test_final == 1  # 实际有雨
+            mask_pred = clf_results['predictions'] == 1  # 预测有雨
+            
+            # 如果有分层回归器，评估其性能
+            strat_reg_results = None
+            if self.stratified_regression and y_category_test_final is not None and self.regressors:
+                strat_reg_results = self.evaluate_stratified_regressors(
+                    X_test_final, y_amount_test_final, y_category_test_final, mask_true, mask_pred)
+                
+                # 同时评估类别分类器
+                if self.rain_category_classifier is not None:
+                    self.evaluate_category_classifier(X_test_final, y_category_test_final, mask_true, mask_pred)
+            
+            # 评估单一回归器
+            reg_results = self.evaluate_regressor(X_test_final, y_amount_test_final, mask_true, mask_pred)
+            
+            # 步骤12: 综合评估
+            print("\n两阶段降雨预测系统评估总结：")
+            print(f"二分类器 ({self.clf_model_type}) F1分数: {clf_results['f1']:.4f}")
+            
+            if strat_reg_results:
+                print(f"分层回归器 RMSE: {strat_reg_results['rmse']:.4f}, R²: {strat_reg_results['r2']:.4f}")
+            
+            if reg_results:
+                print(f"单一回归器 ({self.reg_model_type}) RMSE: {reg_results['rmse']:.4f}, R²: {reg_results['r2']:.4f}")
+            
+            return {
+                'classifier_results': clf_results,
+                'regressor_results': reg_results,
+                'stratified_regressor_results': strat_reg_results
+            }
+        except Exception as e:
+            print(f"训练和评估过程中出错: {str(e)}")
+            traceback.print_exc()
+            return None
+    
+    def predict(self, new_data, preprocess=True):
+        """
+        使用训练好的模型进行预测
+        
+        参数:
+            new_data: DataFrame或numpy数组，输入数据
+            preprocess: bool，是否预处理输入数据
+            
+        返回:
+            dict: 包含降雨概率、二值分类结果和降雨量预测的字典
+        """
+        if self.binary_classifier is None:
+            raise ValueError("模型未训练，请先调用train_and_evaluate方法")
+        
+        # 如果需要预处理输入数据
+        if preprocess:
+            # 执行与训练期间相同的预处理
+            processed_data = self.preprocess_data(new_data, silent=True)
+            
+            # 使用特征选择，但不更新feature_names
+            X, _, _, _ = self.select_features(processed_data, silent=True)
+            
+            # 确保只使用训练过的特征（解决维度不匹配问题）
+            if isinstance(X, pd.DataFrame) and self.feature_names is not None:
+                # 检查是否所有必需的特征都存在
+                missing_features = [f for f in self.feature_names if f not in X.columns]
+                if missing_features:
+                    print(f"警告: 预处理数据中缺少以下特征: {missing_features}")
+                    # 为缺失特征填充0
+                    for f in missing_features:
+                        X[f] = 0
+                
+                # 只选择训练中使用的特征
+                X = X[self.feature_names]
+        else:
+            # 直接使用输入数据，假设已经预处理
+            if isinstance(new_data, pd.DataFrame):
+                # 检查是否所有必需的特征都存在
+                if self.feature_names is not None:
+                    missing_features = [f for f in self.feature_names if f not in new_data.columns]
+                    if missing_features:
+                        print(f"警告: 输入数据中缺少以下特征: {missing_features}")
+                        # 为缺失特征填充0
+                        for f in missing_features:
+                            new_data[f] = 0
+                    
+                    # 只选择训练中使用的特征
+                    X = new_data[self.feature_names]
+                else:
+                    X = new_data
+            else:
+                # 假设输入是按顺序排列的特征数组
+                X = new_data
+        
+        # 应用特征标准化
+        if isinstance(X, pd.DataFrame):
+            X_np = X.values
+        else:
+            X_np = X
+            
+        if self.scaler is None:
+            raise ValueError("缩放器未初始化，请先训练模型")
+            
+        X_scaled = self.scaler.transform(X_np)
+        
+        # 第一阶段: 二分类预测是否会下雨
+        try:
+            rain_probabilities = self.binary_classifier.predict_proba(X_scaled)[:, 1]
+            rain_binary = (rain_probabilities >= self.threshold).astype(int)
+        except Exception as e:
+            print(f"二分类预测时出错: {str(e)}")
+            # 使用一个安全的默认值
+            rain_probabilities = np.zeros(len(X))
+            rain_binary = np.zeros(len(X), dtype=int)
+        
+        # 第二阶段: 对预测有雨的样本，预测降雨量
+        rainfall_amount = np.zeros(len(X))
+        rain_categories = np.zeros(len(X), dtype=int)
+        rain_indices = np.where(rain_binary == 1)[0]
+        
+        if len(rain_indices) > 0:
+            # 如果有分层回归，先预测降雨类别
+            if self.stratified_regression and self.regressors and self.rain_category_classifier is not None:
+                try:
+                    # 获取降雨样本
+                    X_rain_scaled = X_scaled[rain_indices]
+                    
+                    # 预测降雨类别
+                    predicted_categories = self.rain_category_classifier.predict(X_rain_scaled)
+                    
+                    # 填充类别预测结果
+                    for i, idx in enumerate(rain_indices):
+                        rain_categories[idx] = predicted_categories[i]
+                    
+                    # 对每个雨样本，根据预测类别使用适当的回归器
+                    for i, (idx, category, prob) in enumerate(zip(rain_indices, predicted_categories, rain_probabilities[rain_indices])):
+                        reg_name = "light" if category == 1 else "medium" if category == 2 else "heavy" if category == 3 else "extreme"
+                        
+                        # 检查是否有该类别的回归器
+                        if reg_name in self.regressors:
+                            regressor = self.regressors[reg_name]
+                        else:
+                            # 回退到单一回归器
+                            regressor = self.regressor
+                        
+                        if regressor is not None:
+                            # 进行预测
+                            x = X_scaled[idx].reshape(1, -1)
+                            try:
+                                if self.log_transform:
+                                    pred_log = regressor.predict(x)[0]
+                                    pred = np.expm1(pred_log)
+                                else:
+                                    pred = regressor.predict(x)[0]
+                                
+                                # 确保预测值非负
+                                pred = max(0, pred)
+                                
+                                # 对大雨和暴雨类别应用安全系数
+                                if category == 3:  # 大雨
+                                    pred = pred * self.heavy_rain_factor
+                                elif category == 4:  # 暴雨
+                                    pred = pred * self.extreme_rain_factor
+                                    
+                                # 对于高概率样本，进一步增加预测量
+                                if prob > 0.7 and category >= 3:  # 高概率大雨或暴雨
+                                    pred = pred * 1.5  # 额外的高概率因子
+                                    
+                                rainfall_amount[idx] = pred
+                            except Exception as e:
+                                print(f"预测降雨量时出错: {str(e)}")
+                except Exception as e:
+                    print(f"分层回归预测时出错: {str(e)}")
+                    # 如果分层回归失败，尝试使用单一回归器
+                    if self.regressor is not None:
+                        try:
+                            X_rain = X_scaled[rain_indices]
+                            if self.log_transform:
+                                rain_predictions_transformed = self.regressor.predict(X_rain)
+                                rain_predictions = np.expm1(rain_predictions_transformed)
+                            else:
+                                rain_predictions = self.regressor.predict(X_rain)
+                            rainfall_amount[rain_indices] = np.maximum(rain_predictions, 0)
+                        except Exception as e2:
+                            print(f"单一回归器预测也失败: {str(e2)}")
+            
+            # 如果没有分层回归或失败，使用单一回归器
+            elif self.regressor is not None:
+                try:
+                    X_rain = X_scaled[rain_indices]
+                    
+                    # 预测降雨量
+                    if hasattr(self, 'log_transform') and self.log_transform:
+                        rain_predictions_transformed = self.regressor.predict(X_rain)
+                        rain_predictions = np.expm1(rain_predictions_transformed)
+                    else:
+                        rain_predictions = self.regressor.predict(X_rain)
+                    
+                    # 确保降雨量非负
+                    rain_predictions = np.maximum(rain_predictions, 0)
+                    
+                    # 填充预测结果
+                    rainfall_amount[rain_indices] = rain_predictions
+                except Exception as e:
+                    print(f"单一回归器预测时出错: {str(e)}")
+        
+        return {
+            'rain_probability': rain_probabilities,  # 降雨概率
+            'rain_binary': rain_binary,  # 是否会下雨（二值分类结果）
+            'rain_category': rain_categories,  # 降雨类别（1:小雨, 2:中雨, 3:大雨, 4:暴雨）
+            'rainfall_amount': rainfall_amount  # 预测的降雨量（毫米）
+        }
+    
+    def save_model(self, filename='rainfall_two_stage_model.joblib'):
+        """保存训练好的模型"""
+        if self.binary_classifier is None:
+            raise ValueError("模型未训练，无法保存")
+            
+        model_data = {
+            'binary_classifier': self.binary_classifier,
+            'regressor': self.regressor,
+            'regressors': self.regressors,
+            'rain_category_classifier': self.rain_category_classifier,
+            'scaler': self.scaler,
+            'threshold': self.threshold,
+            'feature_names': self.feature_names,
+            'feature_importances_clf': self.feature_importances_clf,
+            'feature_importances_reg': self.feature_importances_reg,
+            'clf_model_type': self.clf_model_type,
+            'reg_model_type': self.reg_model_type,
+            'log_transform': getattr(self, 'log_transform', False),
+            'feature_stats': self.feature_stats,
+            'stratified_regression': self.stratified_regression,
+            'rain_thresholds': self.rain_thresholds,
+            'heavy_rain_factor': self.heavy_rain_factor,
+            'extreme_rain_factor': self.extreme_rain_factor
+        }
+        
+        joblib.dump(model_data, filename)
+        print(f"模型已保存到 {filename}")
+        
+    @classmethod
+    def load_model(cls, filename='rainfall_two_stage_model.joblib'):
+        """加载保存的模型"""
+        model_data = joblib.load(filename)
+        
+        # 创建新实例
+        instance = cls(
+            debug=False, 
+            clf_model_type=model_data.get('clf_model_type', 'ensemble'),
+            reg_model_type=model_data.get('reg_model_type', 'gbdt')
+        )
+        
+        # 加载模型组件
+        instance.binary_classifier = model_data['binary_classifier']
+        instance.regressor = model_data.get('regressor')
+        instance.regressors = model_data.get('regressors', {})
+        instance.rain_category_classifier = model_data.get('rain_category_classifier')
+        instance.scaler = model_data['scaler']
+        instance.threshold = model_data['threshold']
+        instance.feature_names = model_data['feature_names']
+        instance.feature_importances_clf = model_data.get('feature_importances_clf')
+        instance.feature_importances_reg = model_data.get('feature_importances_reg')
+        
+        # 加载其他设置
+        if 'log_transform' in model_data:
+            instance.log_transform = model_data['log_transform']
+        
+        # 加载特征统计信息
+        if 'feature_stats' in model_data:
+            instance.feature_stats = model_data['feature_stats']
+            
+        # 加载分层回归设置
+        if 'stratified_regression' in model_data:
+            instance.stratified_regression = model_data['stratified_regression']
+            
+        if 'rain_thresholds' in model_data:
+            instance.rain_thresholds = model_data['rain_thresholds']
+            
+        if 'heavy_rain_factor' in model_data:
+            instance.heavy_rain_factor = model_data['heavy_rain_factor']
+            
+        if 'extreme_rain_factor' in model_data:
+            instance.extreme_rain_factor = model_data['extreme_rain_factor']
+        
+        return instance
+
+
+class FutureRainfallForecastSystem:
+    """
+    未来多时间点降雨预报系统
+    基于RainfallTwoStageModel，预测未来1小时、2小时、3小时的降雨情况
+    """
+    def __init__(self, model_path=None, debug=False):
+        # 存储不同时间点的预测模型
+        self.forecast_models = {}
+        self.time_points = [1, 2, 3]  # 未来1小时、2小时、3小时
+        self.debug = debug
+        
+        # 如果提供了模型路径，尝试加载模型
+        if model_path:
+            self.load_models(model_path)
+    
+    def train_forecast_models(self, data, features_config=None):
+        """
+        为每个时间点训练一个单独的预测模型
+        
+        参数:
+            data: DataFrame 训练数据，应包含时间特征和未来时间点的降雨数据
+            features_config: dict 特征配置（可选）
+        """
+        print("开始训练未来降雨预报模型...")
+        
+        # 检查数据格式
+        if not isinstance(data, pd.DataFrame):
+            try:
+                data = pd.DataFrame(data)
+            except:
+                raise TypeError("数据必须是可转换为DataFrame的格式")
+        
+        # 打印调试信息
+        print(f"原始数据形状: {data.shape}")
+        print(f"原始数据列: {data.columns.tolist()}")
+        print(f"原始数据rain列类型: {data['rain'].dtype}")
+
+        # 确保datetime列是datetime格式
+        if 'datetime' in data.columns:
+            data['datetime'] = pd.to_datetime(data['datetime'])
+        
+        # 检查数据中是否有降雨信息
+        if 'rain' not in data.columns:
+            raise ValueError("数据必须包含降雨量列 (rain)")
+        
+        # 打印数据概要
+        print(f"\n数据概要:")
+        print(f"行数: {len(data)}")
+        print(f"列数: {len(data.columns)}")
+        print(f"降雨量列: rain")
+        print(f"有雨样本数: {sum(data['rain'] > 0)} ({sum(data['rain'] > 0)/len(data)*100:.1f}%)")
+        
+        # 默认特征配置
+        if features_config is None:
+            features_config = {
+                'n_features': 30,  # 每个模型使用的特征数
+                'clf_model_type': 'gbdt',  # 分类器类型
+                'reg_model_type': 'gbdt',  # 回归器类型
+                'time_based_split': True   # 使用时间序列分割
+            }
+        
+        # 为每个未来时间点准备数据并训练模型
+        for time_point in self.time_points:
+            print(f"\n准备未来{time_point}小时降雨预报模型...")
+            
+            try:
+                # 准备数据：移动目标变量以表示未来降雨
+                # 例如，如果time_point=1，则当前行的特征对应下一行的降雨情况
+                future_data = self._prepare_future_data(data, time_point)
+                
+                if len(future_data) < 100:
+                    print(f"警告: 未来{time_point}小时的有效数据样本过少 ({len(future_data)})")
+                    continue
+                    
+                # 创建针对此时间点的模型
+                model = RainfallTwoStageModel(
+                    debug=self.debug,
+                    clf_model_type=features_config['clf_model_type'],
+                    reg_model_type=features_config['reg_model_type'],
+                    output_dir=f'rainfall_forecast_{time_point}h_figures'
+                )
+                
+                # 训练模型
+                results = model.train_and_evaluate(
+                    future_data,
+                    n_features=features_config['n_features'],
+                    time_based_split=features_config['time_based_split']
+                )
+                
+                if results is None:
+                    print(f"未来{time_point}小时模型训练失败，跳过...")
+                    continue
+                
+                # 保存训练好的模型
+                model.save_model(f'rainfall_forecast_{time_point}h_model.joblib')
+                
+                # 将模型添加到字典中
+                self.forecast_models[time_point] = model
+                
+                print(f"未来{time_point}小时降雨预报模型训练完成！")
+            except Exception as e:
+                print(f"训练未来{time_point}小时模型时出错: {str(e)}")
+                traceback.print_exc()
+        
+        if self.forecast_models:
+            print("\n所有时间点的降雨预报模型训练完成！")
+        else:
+            print("\n警告: 所有模型训练都失败了！")
+    
+    def _prepare_future_data(self, data, time_point):
+        """
+        准备用于未来时间点预测的数据
+        将目标变量滞后处理，使当前特征与未来降雨对齐
+        """
+        df = data.copy()
+
+        # 确保索引是连续的
+        df = df.reset_index(drop=True)
+        
+        # 确保时间列是正确的格式
+        if 'datetime' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+            try:
+                df['datetime'] = pd.to_datetime(df['datetime'])
+            except:
+                print(f"警告：无法转换时间列 datetime")
+        
+        # 创建未来降雨量和降雨二值列
+        # 采用移位操作：当前行的特征对应未来行的降雨
+        df[f'future_{time_point}h_rain'] = df['rain'].shift(-time_point)
+        df[f'future_{time_point}h_rain_binary'] = (df[f'future_{time_point}h_rain'] > 0).astype(int)
+        
+        # 删除含有NaN的行（通常是最后几行，因为它们没有对应的未来降雨数据）
+        df = df.dropna(subset=[f'future_{time_point}h_rain'])
+        
+        # 修改后 - 如果已存在rain列，先删除原来的rain列
+        if 'rain' in df.columns:
+            df = df.drop('rain', axis=1)  # 删除原始rain列
+        if 'rain_binary' in df.columns:
+            df = df.drop('rain_binary', axis=1)  # 删除原始rain_binary列（如果有）
+
+        # 将未来降雨列重命名为标准名称，以便模型处理
+        df = df.rename(columns={
+            f'future_{time_point}h_rain': 'rain',
+            f'future_{time_point}h_rain_binary': 'rain_binary'
+        })
+        
+        return df
+    
+    def forecast(self, current_data):
+        """
+        预测未来多个时间点的降雨情况
+        
+        参数:
+            current_data: DataFrame 当前气象数据
+            
+        返回:
+            dict: 包含各时间点降雨预测结果的字典
+        """
+        if not self.forecast_models:
+            raise ValueError("模型未训练，请先调用train_forecast_models方法")
+        
+        forecast_results = {}
+        
+        # 对每个时间点进行预测
+        for time_point, model in self.forecast_models.items():
+            print(f"\n预测未来{time_point}小时降雨情况...")
+            
+            # 使用当前模型预测未来降雨
+            try:
+                prediction = model.predict(current_data)
+                forecast_results[time_point] = prediction
+                
+                # 显示摘要结果
+                rain_prob = prediction['rain_probability'].mean()
+                rain_samples = sum(prediction['rain_binary'])
+                
+                print(f"未来{time_point}小时降雨概率: {rain_prob:.4f}")
+                print(f"预测有雨样本数: {rain_samples}/{len(current_data)} ({rain_samples/len(current_data)*100:.2f}%)")
+                
+                # 如果有降雨预测，显示降雨量情况
+                if rain_samples > 0:
+                    rain_amounts = prediction['rainfall_amount'][prediction['rain_binary'] == 1]
+                    if len(rain_amounts)>0:
+                        
+                        # 对降雨量进行排序
+                        sorted_rain = np.sort(rain_amounts)
+
+                        # 计算截取中间60%的索引位置
+                        n = len(sorted_rain)
+                        start_idx = int(n * 0.2)  # 跳过最低的20%
+                        end_idx = int(n * 0.8)    # 跳过最高的20%
+
+                        # 确保至少有一个元素
+                        if start_idx == end_idx:
+                            middle_rain = sorted_rain[start_idx:start_idx+1]
+                        else:
+                            middle_rain = sorted_rain[start_idx:end_idx]
+
+                        # 计算中间60%的平均值
+                        middle_avg = np.mean(middle_rain)
+
+                    print(f"预测降雨量: 中间60%平均 {middle_avg:.2f}mm, 全部平均 {rain_amounts.mean():.2f}mm, 最大 {rain_amounts.max():.2f}mm")
+                    
+                    # 统计降雨类别
+                    categories = prediction['rain_category'][prediction['rain_binary'] == 1]
+                    if len(categories) > 0:
+                        try:
+                            cat_counts = np.bincount(categories.astype(int))
+                            cat_names = ["", "小雨", "中雨", "大雨", "暴雨"]
+                            for cat, count in enumerate(cat_counts):
+                                if cat > 0 and count > 0 and cat < len(cat_names):
+                                    print(f"  {cat_names[cat]}: {count}个样本 ({count/rain_samples*100:.1f}%)")
+                        except Exception as e:
+                            print(f"统计降雨类别时出错: {str(e)}")
+            
+            except Exception as e:
+                print(f"未来{time_point}小时预测出错: {str(e)}")
+                forecast_results[time_point] = None
+        
+        return forecast_results
+    
+    def load_models(self, base_path):
+            """加载所有时间点的预测模型"""
+            models_loaded = 0
+            
+            try:
+                for time_point in self.time_points:
+                    model_path = f'{base_path}/rainfall_forecast_{time_point}h_model.joblib'
+                    if os.path.exists(model_path):
+                        print(f"加载未来{time_point}小时降雨预报模型...")
+                        try:
+                            model = RainfallTwoStageModel.load_model(model_path)
+                            self.forecast_models[time_point] = model
+                            models_loaded += 1
+                        except Exception as e:
+                            print(f"加载未来{time_point}小时模型时出错: {str(e)}")
+                    else:
+                        print(f"未找到未来{time_point}小时的模型文件: {model_path}")
+                
+                if models_loaded > 0:
+                    print(f"成功加载 {models_loaded} 个时间点的预报模型")
+                else:
+                    print("警告: 未能加载任何预报模型")
+                    
+            except Exception as e:
+                print(f"加载模型出错: {str(e)}")
+    
+    def save_models(self, base_path):
+        """保存所有时间点的预测模型"""
+        os.makedirs(base_path, exist_ok=True)
+        
+        for time_point, model in self.forecast_models.items():
+            model_path = f'{base_path}/rainfall_forecast_{time_point}h_model.joblib'
+            print(f"保存未来{time_point}小时降雨预报模型...")
+            model.save_model(model_path)
+        
+        print(f"成功保存 {len(self.forecast_models)} 个时间点的预报模型")
+    
+    def visualize_forecast(self, forecast_results, output_dir='forecast_visualizations'):
+        """
+        可视化预测结果
+        
+        参数:
+            forecast_results: dict 各时间点的预测结果
+            output_dir: str 输出目录
+        """
+        os.makedirs(output_dir, exist_ok=True)
+        
+        if not forecast_results:
+            print("没有预测结果可供可视化")
+            return
+        
+        # 准备绘图数据
+        time_labels = [f"{t}hour" for t in forecast_results.keys()]
+        rain_probs = [forecast_results[t]['rain_probability'].mean() for t in forecast_results.keys()]
+        
+        # 准备降雨量数据
+        rain_amounts = []
+        for t in forecast_results.keys():
+            result = forecast_results[t]
+            if sum(result['rain_binary']) > 0:
+                # 计算有雨样本的平均降雨量
+                amounts = result['rainfall_amount'][result['rain_binary'] == 1]
+                rain_amounts.append(amounts.mean())
+            else:
+                rain_amounts.append(0)
+        
+        # 绘制降雨概率随时间变化图
+        plt.figure(figsize=(10, 6))
+        plt.bar(time_labels, rain_probs, color='skyblue')
+        plt.axhline(y=0.5, color='r', linestyle='--', label='50%概率阈值')
+        plt.ylim(0, 1)
+        plt.title('Prediction of rainfall probability at various time points in the future')
+        plt.ylabel('Rainfall probability')
+        plt.xlabel('pred_time')
+        plt.grid(True, alpha=0.3)
+        plt.legend()
+        plt.tight_layout()
+        plt.savefig(f"{output_dir}/rainfall_probability_forecast.png")
+        plt.close()
+        
+        # 绘制降雨量随时间变化图
+        plt.figure(figsize=(10, 6))
+        plt.bar(time_labels, rain_amounts, color='blue')
+        plt.title('Prediction of rainfall at different time points in the future')
+        plt.ylabel('rain (mm)')
+        plt.xlabel('pred_time')
+        plt.grid(True, alpha=0.3)
+        plt.tight_layout()
+        plt.savefig(f"{output_dir}/rainfall_amount_forecast.png")
+        plt.close()
+
+def get_current_weather_data():
+    """
+    模拟从气象站获取当前的气象数据
+    在实际应用中，这个函数会从传感器、API或其他数据源获取数据
+    """
+    # 模拟数据 - 在实际应用中会被真实数据替代
+    current_data = {
+        'datetime': [datetime.datetime.now()],
+        'PWV': [25.5],          # 可降水量
+        'ZTD': [2450.3],        # 对流层延迟
+        'Temp': [305.6],        # 温度（开尔文）
+        'Press': [1008.5],      # 气压（hPa）
+        'rh': [78.5],           # 相对湿度（%）
+        'ws': [3.2]             # 风速（m/s）
+    }
+    
+    # 创建DataFrame
+    current_df = pd.DataFrame(current_data)
+    
+    # 加载历史数据以计算变化率和滚动统计
+    try:
+        history_df = pd.read_csv('weather_history_cache.csv')
+        history_df['datetime'] = pd.to_datetime(history_df['datetime'])
+        
+        # 合并当前数据与历史数据
+        combined_df = pd.concat([history_df, current_df]).reset_index(drop=True)
+        
+        # 确保按时间排序
+        combined_df = combined_df.sort_values(by='datetime')
+        
+        # 保存最新的24小时数据作为历史缓存
+        combined_df.tail(24).to_csv('weather_history_cache.csv', index=False)
+        
+        # 计算变化率
+        for var in ['PWV', 'ZTD', 'Temp', 'Press', 'rh']:
+            # 1小时变化（假设数据是每小时一条）
+            combined_df[f'{var}_1h_change'] = combined_df[var].diff(1)
+            
+            # 3小时变化
+            combined_df[f'{var}_3h_change'] = combined_df[var].diff(3)
+            
+            # 6小时变化
+            combined_df[f'{var}_6h_change'] = combined_df[var].diff(6)
+            
+            # 计算滚动均值
+            combined_df[f'{var}_3h_mean'] = combined_df[var].rolling(window=3, min_periods=1).mean()
+            combined_df[f'{var}_6h_mean'] = combined_df[var].rolling(window=6, min_periods=1).mean()
+            
+            # 滚动标准差
+            combined_df[f'{var}_6h_std'] = combined_df[var].rolling(window=6, min_periods=1).std().fillna(0)
+            
+            # 异常值（相对于平均值的标准化偏差）
+            var_mean = combined_df[var].mean()
+            var_std = combined_df[var].std()
+            combined_df[f'{var}_anomaly'] = (combined_df[var] - var_mean) / var_std if var_std > 0 else 0
+            
+            # 急剧变化指标
+            var_std = combined_df[f'{var}_1h_change'].std()
+            if var_std > 0:
+                combined_df[f'{var}_rapid_change'] = (np.abs(combined_df[f'{var}_1h_change']) > (2 * var_std)).astype(int)
+            else:
+                combined_df[f'{var}_rapid_change'] = 0
+            
+            # 变化加速度（变化率的变化率）
+            combined_df[f'{var}_change_accel'] = combined_df[f'{var}_1h_change'].diff(1).fillna(0)
+        
+        # 添加组合特征
+        if all(var in combined_df.columns for var in ['PWV', 'rh']):
+            # 湿度可用性指数
+            combined_df['PWV_rh'] = combined_df['PWV'] * combined_df['rh'] / 100
+            
+            # 累积湿度变化
+            combined_df['PWV_cumul_change'] = combined_df['PWV'].diff(1).rolling(window=6).sum().fillna(0)
+            combined_df['rh_cumul_change'] = combined_df['rh'].diff(1).rolling(window=6).sum().fillna(0)
+        
+        if all(var in combined_df.columns for var in ['PWV', 'Temp']):
+            # 温度-湿度相互作用
+            combined_df['PWV_Temp'] = combined_df['PWV'] * combined_df['Temp']
+            
+            # 饱和潜力
+            temp_exp = np.exp(17.27 * (combined_df['Temp']-273.15) / ((combined_df['Temp']-273.15) + 237.3))
+            combined_df['saturation_index'] = combined_df['PWV'] / (0.1 * temp_exp)
+        
+        if all(var in combined_df.columns for var in ['Press', 'Press_1h_change']):
+            # 气压变化强度
+            combined_df['Press_change_intensity'] = combined_df['Press_1h_change'].abs()
+            
+            # 气压下降指标
+            combined_df['Press_falling'] = (combined_df['Press_1h_change'] < 0).astype(int)
+            
+            # 气压快速下降指标
+            press_quantile = combined_df['Press_1h_change'].abs().quantile(0.75)
+            combined_df['Press_rapid_fall'] = ((combined_df['Press_1h_change'] < 0) & 
+                                              (combined_df['Press_1h_change'].abs() > press_quantile)).astype(int)
+        
+        if all(var in combined_df.columns for var in ['ZTD', 'PWV']):
+            # ZTD-PWV比率
+            combined_df['ZTD_PWV_ratio'] = combined_df['ZTD'] / combined_df['PWV'].replace(0, np.nan).fillna(combined_df['PWV'].mean())
+            
+            # 如果有变化率特征
+            if all(var in combined_df.columns for var in ['ZTD_1h_change', 'PWV_1h_change']):
+                combined_df['ZTD_PWV_change'] = combined_df['ZTD_1h_change'] * combined_df['PWV_1h_change']
+            
+            # 湿度不稳定指标
+            pwv_mean = combined_df['PWV_6h_mean'].mean()
+            if pwv_mean > 0:
+                combined_df['moisture_instability'] = combined_df['PWV_6h_std'] / combined_df['PWV_6h_mean']
+            else:
+                combined_df['moisture_instability'] = 0
+            
+        # 添加降雨潜力指标
+        if all(var in combined_df.columns for var in ['PWV', 'rh', 'Temp', 'Press_1h_change']):
+            # 组合降雨指标
+            combined_df['rain_potential'] = ((combined_df['PWV'] * combined_df['rh'] / 100) * 
+                                           (0.1 - combined_df['Press_1h_change']) / 
+                                           (combined_df['Temp'] + 10))
+            
+            # 强降雨潜力指标
+            pwv_mean = combined_df['PWV'].mean() + 0.1  # 避免除以零
+            combined_df['heavy_rain_potential'] = (combined_df['PWV'] * (combined_df['rh'] / 100) * 
+                                                 (combined_df['Press_rapid_fall'] + 1) * 
+                                                 (1 + np.abs(combined_df['PWV_1h_change']) / 
+                                                  pwv_mean))
+            
+            # 降雨持续性指标
+            combined_df['rain_persistence'] = (combined_df['PWV_cumul_change'] * 
+                                             combined_df['rh_cumul_change'] * 
+                                             (1 + combined_df['Press_change_intensity']))
+        
+        # 获取最新一行作为当前数据，包含所有计算的特征
+        current_df = combined_df.iloc[-1:].reset_index(drop=True)
+        
+    except Exception as e:
+        print(f"计算特征时出错: {str(e)}")
+        print("创建一个新的历史缓存文件...")
+        
+        # 如果没有历史数据或出错，将当前数据作为历史缓存的第一条记录
+        current_df.to_csv('weather_history_cache.csv', index=False)
+        
+        # 这种情况下无法计算时间序列特征
+        print("警告: 无法计算变化率和滚动统计特征，首次运行只能使用基础特征")
+    
+    return current_df
+
+def run_realtime_rainfall_forecast():
+    """
+    运行实时降雨预报系统
+    """
+    print("启动实时降雨预报系统...")
+    
+    # 检查模型目录是否存在
+    model_dir = 'rainfall_forecast_models'
+    if not os.path.exists(model_dir):
+        print(f"错误: 未找到模型目录 '{model_dir}'")
+        print("请先使用历史数据训练模型")
+        return
+    
+    # 加载预测模型
+    print("加载未来降雨预报模型...")
+    forecast_system = FutureRainfallForecastSystem()
+    forecast_system.load_models(model_dir)
+    
+    if not forecast_system.forecast_models:
+        print("错误: 未能加载任何模型")
+        return
+    
+    # 获取当前气象数据
+    print("获取当前气象数据...")
+    current_weather = get_current_weather_data()
+    
+    # 检查数据是否完整
+    if current_weather is None or len(current_weather) == 0:
+        print("错误: 无法获取当前气象数据")
+        return
+    
+    print("\n当前气象条件:")
+    print(f"时间: {current_weather['datetime'].iloc[0]}")
+    print(f"温度: {current_weather['Temp'].iloc[0]:.1f}K")
+    print(f"湿度: {current_weather['rh'].iloc[0]:.1f}%")
+    print(f"气压: {current_weather['Press'].iloc[0]:.1f} hPa")
+    print(f"可降水量(PWV): {current_weather['PWV'].iloc[0]:.1f} mm")
+    
+    # 进行未来降雨预测
+    print("\n开始预测未来降雨情况...")
+    forecast_results = forecast_system.forecast(current_weather)
+    
+    # 可视化预测结果
+    print("\n生成降雨预报可视化...")
+    forecast_system.visualize_forecast(forecast_results, output_dir='realtime_forecast_results')
+    
+    # 输出预测报告
+    print("\n未来降雨预报摘要:")
+    print("=" * 50)
+    
+    # 当前时间
+    current_time = datetime.datetime.now()
+    print(f"预报时间: {current_time.strftime('%Y-%m-%d %H:%M')}")
+    
+    # 用于存储预报结果的列表，以便可能的后续处理或发送通知
+    forecast_summary = []
+    
+    for time_point, result in forecast_results.items():
+        # 计算预测时间
+        forecast_time = current_time + timedelta(hours=time_point)
+        
+        print(f"\n未来{time_point}小时 ({forecast_time.strftime('%H:%M')}) 降雨预报:")
+        print("-" * 50)
+        
+        # 降雨概率
+        rain_prob = result['rain_probability'].mean()
+        
+        # 是否有雨
+        rain_predicted = sum(result['rain_binary']) > 0
+        rain_status = "有" if rain_predicted else "无"
+        
+        print(f"降雨概率: {rain_prob:.2f}")
+        print(f"预计降雨: {rain_status}")
+        
+        forecast_record = {
+            'time_point': time_point,
+            'forecast_time': forecast_time,
+            'rain_probability': rain_prob,
+            'rain_predicted': rain_predicted
+        }
+        
+        # 如果预测有雨，显示详细信息
+        if rain_predicted:
+            rain_indices = np.where(result['rain_binary'] == 1)[0]
+            rain_amounts = result['rainfall_amount'][rain_indices]
+            rain_categories = result['rain_category'][rain_indices]
+            
+            # 计算平均降雨量和最大降雨量
+            avg_rain = rain_amounts.mean()
+            max_rain = rain_amounts.max()
+            
+            print(f"预计降雨量: {avg_rain:.2f}mm (平均), {max_rain:.2f}mm (最大)")
+            
+            forecast_record['avg_rainfall'] = avg_rain
+            forecast_record['max_rainfall'] = max_rain
+            
+            # 统计降雨类别
+            try:
+                cat_counts = np.bincount(rain_categories.astype(int))
+                cat_names = ["", "小雨", "中雨", "大雨", "暴雨"]
+                
+                print("降雨类型:")
+                most_common_cat = 0
+                most_common_count = 0
+                
+                for cat, count in enumerate(cat_counts):
+                    if cat > 0 and count > 0:
+                        if count > most_common_count:
+                            most_common_cat = cat
+                            most_common_count = count
+                        print(f"  {cat_names[cat]}: {count}个样本 ({count/len(rain_indices)*100:.1f}%)")
+                
+                forecast_record['rain_category'] = most_common_cat
+                forecast_record['rain_category_name'] = cat_names[most_common_cat] if most_common_cat > 0 else ""
+                
+                # 主要降雨类型
+                if most_common_cat > 0:
+                    print(f"主要降雨类型: {cat_names[most_common_cat]}")
+                
+                # 给出降雨建议
+                advice = ""
+                if most_common_cat >= 3:  # 大雨或暴雨
+                    advice = "建议避免外出，注意防范强降雨可能带来的积水和交通影响"
+                elif most_common_cat == 2:  # 中雨
+                    advice = "出门请携带雨具，道路可能湿滑，驾车注意安全"
+                else:  # 小雨
+                    advice = "带好雨具，不会对出行造成太大影响"
+                    
+                print(f"出行建议: {advice}")
+                forecast_record['advice'] = advice
+            except Exception as e:
+                print(f"处理降雨类别时出错: {str(e)}")
+                forecast_record['rain_category'] = 0
+                forecast_record['rain_category_name'] = ""
+                forecast_record['advice'] = "请注意携带雨具"
+        else:
+            forecast_record['avg_rainfall'] = 0
+            forecast_record['max_rainfall'] = 0
+            forecast_record['rain_category'] = 0
+            forecast_record['rain_category_name'] = ""
+            forecast_record['advice'] = "天气良好，适合户外活动"
+            
+        forecast_summary.append(forecast_record)
+    
+    # 保存预报结果用于历史比较和验证
+    try:
+        forecast_history = pd.DataFrame(forecast_summary)
+        forecast_time_str = current_time.strftime('%Y%m%d_%H%M')
+        forecast_history.to_csv(f'realtime_forecast_results/forecast_{forecast_time_str}.csv', index=False)
+    except Exception as e:
+        print(f"保存预报结果时出错: {str(e)}")
+    
+    print("\n预报完成.")
+    
+    # 检查是否需要发送预警
+    for record in forecast_summary:
+        if record['rain_predicted'] and record.get('rain_category', 0) >= 3:  # 大雨或暴雨
+            print(f"\n⚠️ 预警: 未来{record['time_point']}小时预计有{record['rain_category_name']}!")
+            # 这里可以添加发送邮件、短信或其他通知的代码
+    
+    return forecast_summary
+
+
+# 主程序入口
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="未来降雨预报系统 - 训练模型和实时预测")
+    parser.add_argument('--mode', type=str, default='train', choices=['train', 'predict', 'realtime'],
+                        help='执行模式: train-训练模型, predict-单次预测, realtime-实时预测')
+    parser.add_argument('--data', type=str, default='rainfall_data_time.csv',
+                        help='数据文件路径')
+    parser.add_argument('--model_dir', type=str, default='rainfall_forecast_models',
+                        help='模型保存/加载目录')
+    parser.add_argument('--n_features', type=int, default=35,
+                        help='使用的特征数量')
+    parser.add_argument('--clf_model', type=str, default='ensemble',
+                        choices=['lr', 'rf', 'gbdt', 'xgboost', 'ensemble'],
+                        help='分类器类型')
+    parser.add_argument('--reg_model', type=str, default='gbdt',
+                        choices=['rf', 'gbdt', 'ridge', 'xgboost'],
+                        help='回归器类型')
+    parser.add_argument('--time_based', action='store_true',
+                        help='是否使用基于时间的数据分割（默认使用随机分割）')
+    parser.add_argument('--debug', action='store_true',
+                        help='启用调试模式，输出更详细的信息')
+    
+    args = parser.parse_args()
+    
+    # 创建输出目录
+    os.makedirs('realtime_forecast_results', exist_ok=True)
+    
+    try:
+        if args.mode == 'train':
+            # 训练模式
+            print(f"加载训练数据: {args.data}")
+            try:
+                # 读取历史数据
+                historical_data = pd.read_csv(args.data)
+                
+                # 确保时间列是日期时间格式
+                if 'datetime' in historical_data.columns:
+                    historical_data['datetime'] = pd.to_datetime(historical_data['datetime'])
+                    
+                    # 按时间排序数据
+                    historical_data = historical_data.sort_values(by='datetime')
+                    
+                    print(f"数据时间范围: {historical_data['datetime'].min()} 到 {historical_data['datetime'].max()}")
+                else:
+                    print("警告: 数据中没有找到时间列，假设数据已按时间排序")
+                
+                print(f"总样本数: {len(historical_data)}")
+                
+                # 创建未来降雨预报系统
+                print("\n创建未来降雨预报系统...")
+                forecast_system = FutureRainfallForecastSystem(debug=args.debug)
+                
+                # 配置特征
+                features_config = {
+                    'n_features': args.n_features,
+                    'clf_model_type': args.clf_model,
+                    'reg_model_type': args.reg_model,
+                    'time_based_split': args.time_based
+                }
+                
+                # 训练未来各时间点的预报模型
+                print("\n开始训练未来降雨预报模型...")
+                forecast_system.train_forecast_models(historical_data, features_config)
+                
+                # 保存训练好的模型
+                print("\n保存训练好的模型...")
+                forecast_system.save_models(args.model_dir)
+                
+                print("\n未来降雨预报系统训练完成！")
+                
+            except FileNotFoundError:
+                print(f"错误: 找不到训练数据文件 '{args.data}'")
+                print("请确保该文件存在于当前目录中")
+                
+            except Exception as e:
+                print(f"训练过程中出错: {str(e)}")
+                traceback.print_exc()
+                
+        elif args.mode == 'predict':
+            # 单次预测模式
+            print(f"加载当前气象数据: {args.data}")
+            try:
+                # 读取当前气象数据
+                current_data = pd.read_csv(args.data)
+                
+                # 加载预测模型
+                print("\n加载未来降雨预报模型...")
+                forecast_system = FutureRainfallForecastSystem(model_path=args.model_dir, debug=args.debug)
+                
+                # 进行预测
+                print("\n开始预测未来降雨情况...")
+                forecast_results = forecast_system.forecast(current_data)
+                
+                # 可视化预测结果
+                print("\n生成预测结果可视化...")
+                forecast_system.visualize_forecast(forecast_results)
+                
+                print("\n预测完成.")
+                
+            except FileNotFoundError:
+                print(f"错误: 找不到当前气象数据文件 '{args.data}'")
+                
+            except Exception as e:
+                print(f"预测过程中出错: {str(e)}")
+                traceback.print_exc()
+                
+        elif args.mode == 'realtime':
+            # 实时预测模式
+            try:
+                run_realtime_rainfall_forecast()
+            except Exception as e:
+                print(f"实时预测过程中出错: {str(e)}")
+                traceback.print_exc()
+            
+            # 如果需要周期性运行
+            """
+            while True:
+                try:
+                    run_realtime_rainfall_forecast()
+                    print("\n等待下次预报...")
+                    # 等待1小时
+                    time.sleep(3600)
+                except KeyboardInterrupt:
+                    print("\n用户中断，系统退出")
+                    break
+                except Exception as e:
+                    print(f"\n运行时错误: {str(e)}")
+                    traceback.print_exc()
+                    # 出错后等待10分钟再重试
+                    time.sleep(600)
+            """
+    
+    except KeyboardInterrupt:
+        print("\n用户中断，系统退出")
+    except Exception as e:
+        print(f"\n系统错误: {str(e)}")
+        traceback.print_exc()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..448f4d7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+numpy>=1.21.0
+pandas>=1.3.0
+scikit-learn>=0.24.2
+xgboost>=1.4.2
+matplotlib>=3.4.2
+seaborn>=0.11.1
+pymysql>=1.0.2
+joblib>=1.0.1
\ No newline at end of file
diff --git a/src/data/data_processor.py b/src/data/data_processor.py
new file mode 100644
index 0000000..fe95d09
--- /dev/null
+++ b/src/data/data_processor.py
@@ -0,0 +1,142 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from ..utils.feature_engineering import FeatureEngineering
+
+class DataProcessor:
+    def __init__(self, debug=True):
+        """
+        初始化数据处理器
+        """
+        self.debug = debug
+        self.scaler = StandardScaler()
+        self.feature_columns = None
+        self.target_columns = None
+
+    def debug_print(self, message):
+        """
+        调试信息打印
+        """
+        if self.debug:
+            print(message)
+
+    def preprocess_data(self, data, silent=False):
+        """
+        数据预处理
+        """
+        if not silent:
+            self.debug_print("开始数据预处理...")
+
+        # 复制数据以避免修改原始数据
+        df = data.copy()
+
+        # 确保datetime列是datetime类型
+        if 'datetime' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+            df['datetime'] = pd.to_datetime(df['datetime'])
+
+        # 处理缺失值
+        df = self._handle_missing_values(df)
+
+        # 特征工程
+        df = FeatureEngineering.process_features(df)
+
+        # 移除包含NaN的行
+        df = df.dropna()
+
+        if not silent:
+            self.debug_print(f"预处理完成，数据形状: {df.shape}")
+
+        return df
+
+    def _handle_missing_values(self, df):
+        """
+        处理缺失值
+        """
+        # 对于时间序列数据，使用前向填充
+        df = df.fillna(method='ffill')
+        
+        # 对于剩余的缺失值，使用列的中位数填充
+        df = df.fillna(df.median())
+        
+        return df
+
+    def select_features(self, data, n_features=30, silent=False):
+        """
+        特征选择
+        """
+        if not silent:
+            self.debug_print(f"选择前{n_features}个特征...")
+
+        # 计算特征重要性
+        feature_importance = self._calculate_feature_importance(data)
+        
+        # 选择最重要的特征
+        selected_features = feature_importance.nlargest(n_features).index.tolist()
+        
+        if not silent:
+            self.debug_print(f"选择的特征: {selected_features}")
+
+        return selected_features
+
+    def _calculate_feature_importance(self, data):
+        """
+        计算特征重要性
+        """
+        # 这里使用一个简单的基于相关性的特征重要性计算
+        # 在实际应用中，你可能需要使用更复杂的方法
+        correlations = data.corr()['rainfall'].abs()
+        return correlations.drop('rainfall')
+
+    def split_data(self, X, y_binary, y_amount, y_category=None, time_based=False, test_size=0.2):
+        """
+        数据分割
+        """
+        if time_based:
+            # 基于时间的分割
+            split_idx = int(len(X) * (1 - test_size))
+            X_train = X[:split_idx]
+            X_test = X[split_idx:]
+            y_train_binary = y_binary[:split_idx]
+            y_test_binary = y_binary[split_idx:]
+            y_train_amount = y_amount[:split_idx]
+            y_test_amount = y_amount[split_idx:]
+            
+            if y_category is not None:
+                y_train_category = y_category[:split_idx]
+                y_test_category = y_category[split_idx:]
+            else:
+                y_train_category = None
+                y_test_category = None
+        else:
+            # 随机分割
+            from sklearn.model_selection import train_test_split
+            X_train, X_test, y_train_binary, y_test_binary, y_train_amount, y_test_amount = train_test_split(
+                X, y_binary, y_amount, test_size=test_size, random_state=42
+            )
+            
+            if y_category is not None:
+                _, _, y_train_category, y_test_category = train_test_split(
+                    X, y_category, test_size=test_size, random_state=42
+                )
+            else:
+                y_train_category = None
+                y_test_category = None
+
+        return X_train, X_test, y_train_binary, y_test_binary, y_train_amount, y_test_amount, y_train_category, y_test_category
+
+    def scale_features(self, X_train, X_test):
+        """
+        特征缩放
+        """
+        # 保存特征列名
+        self.feature_columns = X_train.columns
+        
+        # 缩放特征
+        X_train_scaled = self.scaler.fit_transform(X_train)
+        X_test_scaled = self.scaler.transform(X_test)
+        
+        # 转换回DataFrame
+        X_train_scaled = pd.DataFrame(X_train_scaled, columns=self.feature_columns)
+        X_test_scaled = pd.DataFrame(X_test_scaled, columns=self.feature_columns)
+        
+        return X_train_scaled, X_test_scaled
\ No newline at end of file
diff --git a/src/database/db_connector.py b/src/database/db_connector.py
new file mode 100644
index 0000000..4c01fcd
--- /dev/null
+++ b/src/database/db_connector.py
@@ -0,0 +1,236 @@
+import pandas as pd
+import pymysql
+from datetime import datetime, timedelta
+
+class DatabaseConnector:
+    def __init__(self, rtk_host='localhost', rtk_user='root', rtk_password='root', rtk_database='rtk_data',
+                 rain_host='8.134.185.53', rain_user='remote', rain_password='root', rain_database='rain_db'):
+        """
+        初始化数据库连接器
+        """
+        # RTK数据库配置
+        self.rtk_host = 'localhost'
+        self.rtk_user = 'root'
+        self.rtk_password = 'root'
+        self.rtk_database = 'rtk_data'
+        self.rtk_connection = None
+
+        # 气象数据库配置
+        self.rain_host = '8.134.185.53'
+        self.rain_user = 'remote'
+        self.rain_password = 'root'
+        self.rain_database = 'rain_db'
+        self.rain_connection = None
+
+    def connect_rtk(self):
+        """
+        建立RTK数据库连接
+        """
+        try:
+            self.rtk_connection = pymysql.connect(
+                host=self.rtk_host,
+                user=self.rtk_user,
+                password=self.rtk_password,
+                database=self.rtk_database
+            )
+            return True
+        except Exception as e:
+            print(f"RTK数据库连接失败: {str(e)}")
+            return False
+
+    def connect_rain(self):
+        """
+        建立气象数据库连接
+        """
+        try:
+            self.rain_connection = pymysql.connect(
+                host=self.rain_host,
+                user=self.rain_user,
+                password=self.rain_password,
+                database=self.rain_database
+            )
+            return True
+        except Exception as e:
+            print(f"气象数据库连接失败: {str(e)}")
+            return False
+
+    def disconnect(self):
+        """
+        关闭所有数据库连接
+        """
+        if self.rtk_connection:
+            self.rtk_connection.close()
+            self.rtk_connection = None
+        if self.rain_connection:
+            self.rain_connection.close()
+            self.rain_connection = None
+
+    def get_latest_weather_data(self):
+        """
+        获取最新的气象数据，合并RTK和气象数据的最新记录
+        使用RTK数据的时间作为基准时间
+        """
+        # 获取RTK数据
+        if not self.rtk_connection:
+            if not self.connect_rtk():
+                return None
+
+        try:
+            # 获取RTK数据最新记录
+            rtk_query = """
+                SELECT 
+                    timestamp,
+                    pwv,
+                    ztd,
+                    ztd_gradient_east,
+                    ztd_gradient_north
+                FROM rtk_data
+                ORDER BY timestamp DESC
+                LIMIT 1
+            """
+            rtk_df = pd.read_sql(rtk_query, self.rtk_connection)
+            
+            if rtk_df.empty:
+                print("RTK数据库中没有数据")
+                return None
+
+            # 获取气象数据最新记录
+            if not self.rain_connection:
+                if not self.connect_rain():
+                    print("无法连接到气象数据库")
+                    return rtk_df
+
+            rain_query = """
+                SELECT 
+                    wind_speed,
+                    wind_force,
+                    wind_direction_360 as wind_direction,
+                    humidity,
+                    temperature,
+                    atm_pressure,
+                    solar_radiation,
+                    rainfall
+                FROM sensor_data
+                ORDER BY timestamp DESC
+                LIMIT 1
+            """
+            rain_df = pd.read_sql(rain_query, self.rain_connection)
+            
+            if rain_df.empty:
+                print("气象数据库中没有数据")
+                return rtk_df
+
+            # 重命名RTK数据列
+            rtk_df = rtk_df.rename(columns={
+                'timestamp': 'datetime',
+                'pwv': 'PWV',
+                'ztd': 'ZTD'
+            })
+
+            # 重命名气象数据列
+            rain_df = rain_df.rename(columns={
+                'humidity': 'rh',
+                'temperature': 'Temp',
+                'atm_pressure': 'Press',
+                'wind_speed': 'ws'
+            })
+
+            # 将RTK的时间戳转换为datetime类型
+            rtk_df['datetime'] = pd.to_datetime(rtk_df['datetime'])
+
+            # 合并数据，使用RTK的时间
+            merged_df = pd.concat([rtk_df, rain_df], axis=1)
+            
+            # 删除重复的列（如果有的话）
+            merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]
+            
+            return merged_df
+
+        except Exception as e:
+            print(f"获取数据失败: {str(e)}")
+            return None
+
+    def get_historical_data(self, start_date, end_date):
+        """
+        获取历史数据，合并RTK和气象数据
+        """
+        # 获取RTK数据
+        if not self.rtk_connection:
+            if not self.connect_rtk():
+                return None
+
+        try:
+            # 获取RTK数据
+            rtk_query = """
+                SELECT 
+                    timestamp,
+                    pwv,
+                    ztd,
+                    ztd_gradient_east,
+                    ztd_gradient_north
+                FROM rtk_data
+                WHERE timestamp BETWEEN %s AND %s
+                ORDER BY timestamp
+            """
+            rtk_df = pd.read_sql(rtk_query, self.rtk_connection, params=[start_date, end_date])
+            
+            # 重命名RTK数据列
+            rtk_df = rtk_df.rename(columns={
+                'timestamp': 'datetime',
+                'pwv': 'PWV',
+                'ztd': 'ZTD'
+            })
+
+            # 获取气象数据
+            if not self.rain_connection:
+                if not self.connect_rain():
+                    return rtk_df  # 如果气象数据库连接失败，返回RTK数据
+
+            rain_query = """
+                SELECT 
+                    timestamp,
+                    wind_speed,
+                    wind_force,
+                    wind_direction_360 as wind_direction,
+                    humidity,
+                    temperature,
+                    atm_pressure,
+                    solar_radiation,
+                    rainfall
+                FROM sensor_data
+                WHERE timestamp BETWEEN %s AND %s
+                ORDER BY timestamp
+            """
+            rain_df = pd.read_sql(rain_query, self.rain_connection, params=[start_date, end_date])
+            
+            # 重命名气象数据列
+            rain_df = rain_df.rename(columns={
+                'timestamp': 'datetime',
+                'humidity': 'rh',
+                'temperature': 'Temp',
+                'atm_pressure': 'Press',
+                'wind_speed': 'ws'
+            })
+
+            # 合并数据
+            if not rain_df.empty:
+                # 将时间戳转换为datetime类型
+                rtk_df['datetime'] = pd.to_datetime(rtk_df['datetime'])
+                rain_df['datetime'] = pd.to_datetime(rain_df['datetime'])
+                
+                # 合并数据
+                merged_df = pd.merge(rtk_df, rain_df, on='datetime', how='outer')
+                
+                # 按时间排序
+                merged_df = merged_df.sort_values('datetime')
+                
+                # 填充缺失值
+                merged_df = merged_df.fillna(method='ffill')
+                
+                return merged_df
+            else:
+                return rtk_df
+
+        except Exception as e:
+            print(f"获取历史数据失败: {str(e)}")
+            return None
\ No newline at end of file
diff --git a/src/forecast/forecast_system.py b/src/forecast/forecast_system.py
new file mode 100644
index 0000000..d230228
--- /dev/null
+++ b/src/forecast/forecast_system.py
@@ -0,0 +1,154 @@
+import os
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+from ..models.rainfall_model import RainfallTwoStageModel
+from ..data.data_processor import DataProcessor
+from ..utils.feature_engineering import FeatureEngineering
+
+class FutureRainfallForecastSystem:
+    def __init__(self, model_path=None, debug=False):
+        """
+        初始化未来降雨预报系统
+        """
+        self.debug = debug
+        self.forecast_models = {}
+        self.data_processor = DataProcessor(debug=debug)
+        self.model_path = model_path
+
+    def train_forecast_models(self, data, features_config=None):
+        """
+        训练多个时间点的预测模型
+        """
+        print("开始训练未来降雨预报模型...")
+        
+        # 预处理数据
+        processed_data = self.data_processor.preprocess_data(data)
+        
+        # 为每个预测时间点训练模型
+        for time_point in [1, 3, 6, 12, 24]:  # 预测未来1, 3, 6, 12, 24小时
+            print(f"\n训练{time_point}小时预报模型...")
+            
+            # 准备训练数据
+            future_data = self._prepare_future_data(processed_data, time_point)
+            
+            # 创建和训练模型
+            model = RainfallTwoStageModel(debug=self.debug)
+            model.train_and_evaluate(future_data)
+            
+            # 保存模型
+            self.forecast_models[time_point] = model
+            
+            print(f"{time_point}小时预报模型训练完成")
+
+    def _prepare_future_data(self, data, time_point):
+        """
+        准备未来时间点的训练数据
+        """
+        # 创建时间偏移
+        future_data = data.copy()
+        future_data['datetime'] = future_data['datetime'] + timedelta(hours=time_point)
+        
+        # 合并当前数据和未来数据
+        merged_data = pd.merge(
+            data,
+            future_data[['datetime', 'rainfall']],
+            on='datetime',
+            suffixes=('', '_future')
+        )
+        
+        # 重命名目标变量
+        merged_data = merged_data.rename(columns={'rainfall_future': 'rainfall'})
+        
+        return merged_data
+
+    def forecast(self, current_data):
+        """
+        进行未来降雨预报
+        """
+        if not self.forecast_models:
+            raise ValueError("没有可用的预报模型，请先训练模型")
+
+        # 预处理当前数据
+        processed_data = self.data_processor.preprocess_data(current_data)
+        
+        # 存储预报结果
+        forecast_results = {}
+        
+        # 对每个时间点进行预报
+        for time_point, model in self.forecast_models.items():
+            # 准备预报数据
+            forecast_data = self._prepare_future_data(processed_data, time_point)
+            
+            # 进行预报
+            predictions = model.predict(forecast_data)
+            
+            # 存储结果
+            forecast_results[time_point] = predictions
+        
+        return forecast_results
+
+    def load_models(self, base_path):
+        """
+        加载预报模型
+        """
+        if not os.path.exists(base_path):
+            raise FileNotFoundError(f"模型目录不存在: {base_path}")
+        
+        for time_point in [1, 3, 6, 12, 24]:
+            model_path = os.path.join(base_path, f'model_{time_point}h.joblib')
+            if os.path.exists(model_path):
+                try:
+                    model = RainfallTwoStageModel.load_model(model_path)
+                    self.forecast_models[time_point] = model
+                    print(f"成功加载{time_point}小时预报模型")
+                except Exception as e:
+                    print(f"加载{time_point}小时预报模型失败: {str(e)}")
+
+    def save_models(self, base_path):
+        """
+        保存预报模型
+        """
+        os.makedirs(base_path, exist_ok=True)
+        
+        for time_point, model in self.forecast_models.items():
+            model_path = os.path.join(base_path, f'model_{time_point}h.joblib')
+            model.save_model(model_path)
+            print(f"保存{time_point}小时预报模型到: {model_path}")
+
+    def visualize_forecast(self, forecast_results, output_dir='forecast_visualizations'):
+        """
+        可视化预报结果
+        """
+        import matplotlib.pyplot as plt
+        
+        os.makedirs(output_dir, exist_ok=True)
+        
+        # 创建预报时间序列
+        time_points = sorted(forecast_results.keys())
+        rain_probs = [results['rain_probability'].mean() for results in forecast_results.values()]
+        rain_amounts = [results['rainfall_amount'].mean() for results in forecast_results.values()]
+        
+        # 绘制降雨概率
+        plt.figure(figsize=(10, 6))
+        plt.plot(time_points, rain_probs, 'b-', label='降雨概率')
+        plt.fill_between(time_points, 0, rain_probs, alpha=0.2)
+        plt.xlabel('预报时间（小时）')
+        plt.ylabel('降雨概率')
+        plt.title('未来降雨概率预报')
+        plt.grid(True)
+        plt.legend()
+        plt.savefig(os.path.join(output_dir, 'rain_probability.png'))
+        plt.close()
+        
+        # 绘制降雨量
+        plt.figure(figsize=(10, 6))
+        plt.plot(time_points, rain_amounts, 'r-', label='预计降雨量')
+        plt.fill_between(time_points, 0, rain_amounts, alpha=0.2)
+        plt.xlabel('预报时间（小时）')
+        plt.ylabel('降雨量 (mm)')
+        plt.title('未来降雨量预报')
+        plt.grid(True)
+        plt.legend()
+        plt.savefig(os.path.join(output_dir, 'rainfall_amount.png'))
+        plt.close()
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
new file mode 100644
index 0000000..20321c6
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,232 @@
+import argparse
+import os
+from datetime import datetime
+import pandas as pd
+from database.db_connector import DatabaseConnector
+from data.data_processor import DataProcessor
+from models.rainfall_model import RainfallTwoStageModel
+from forecast.forecast_system import FutureRainfallForecastSystem
+from utils.visualization import Visualization
+
+def train_models(data_file, model_dir, n_features=35, clf_model='ensemble', reg_model='gbdt', time_based=True):
+    """
+    训练模型
+    """
+    print(f"加载训练数据: {data_file}")
+    try:
+        # 读取历史数据
+        historical_data = pd.read_csv(data_file)
+        
+        # 确保时间列是日期时间格式
+        if 'datetime' in historical_data.columns:
+            historical_data['datetime'] = pd.to_datetime(historical_data['datetime'])
+        
+        # 创建预报系统
+        forecast_system = FutureRainfallForecastSystem()
+        
+        # 训练模型
+        forecast_system.train_forecast_models(historical_data)
+        
+        # 保存模型
+        forecast_system.save_models(model_dir)
+        
+        print("模型训练完成")
+        
+    except Exception as e:
+        print(f"训练模型时出错: {str(e)}")
+
+def run_single_prediction(input_file, model_dir):
+    """
+    运行单次预测
+    """
+    print("启动单次预测...")
+    
+    # 检查模型目录是否存在
+    if not os.path.exists(model_dir):
+        print(f"错误: 未找到模型目录 '{model_dir}'")
+        print("请先使用历史数据训练模型")
+        return
+    
+    try:
+        # 读取输入数据
+        print(f"读取输入数据: {input_file}")
+        input_data = pd.read_csv(input_file)
+        
+        # 确保时间列是日期时间格式
+        if 'datetime' in input_data.columns:
+            input_data['datetime'] = pd.to_datetime(input_data['datetime'])
+        
+        # 创建预报系统
+        forecast_system = FutureRainfallForecastSystem()
+        
+        # 加载模型
+        print("加载预报模型...")
+        forecast_system.load_models(model_dir)
+        
+        if not forecast_system.forecast_models:
+            print("错误: 未能加载任何模型")
+            return
+        
+        # 进行预报
+        print("\n开始预测降雨情况...")
+        forecast_results = forecast_system.forecast(input_data)
+        
+        # 可视化预报结果
+        print("\n生成预报可视化...")
+        visualization = Visualization(output_dir='single_prediction_results')
+        visualization.plot_forecast_results(forecast_results)
+        
+        # 输出预报报告
+        print("\n降雨预报摘要:")
+        print("=" * 50)
+        
+        current_time = datetime.now()
+        print(f"预报时间: {current_time.strftime('%Y-%m-%d %H:%M')}")
+        
+        for time_point, result in forecast_results.items():
+            print(f"\n未来{time_point}小时预报:")
+            print("-" * 30)
+            
+            rain_prob = result['rain_probability'].mean()
+            rain_predicted = sum(result['rain_binary']) > 0
+            
+            print(f"降雨概率: {rain_prob:.2%}")
+            print(f"预计降雨: {'有' if rain_predicted else '无'}")
+            
+            if rain_predicted:
+                rain_amount = result['rainfall_amount'].mean()
+                print(f"预计降雨量: {rain_amount:.2f}mm")
+                
+                # 根据降雨量给出建议
+                if rain_amount >= 25:  # 大雨
+                    print("建议: 避免外出，注意防范强降雨可能带来的积水和交通影响")
+                elif rain_amount >= 10:  # 中雨
+                    print("建议: 出门请携带雨具，道路可能湿滑，驾车注意安全")
+                else:  # 小雨
+                    print("建议: 带好雨具，不会对出行造成太大影响")
+        
+        print("\n预测完成")
+        
+    except Exception as e:
+        print(f"运行单次预测时出错: {str(e)}")
+
+def run_realtime_forecast(model_dir):
+    """
+    运行实时预报
+    """
+    print("启动实时降雨预报系统...")
+    
+    # 检查模型目录是否存在
+    if not os.path.exists(model_dir):
+        print(f"错误: 未找到模型目录 '{model_dir}'")
+        print("请先使用历史数据训练模型")
+        return
+    
+    try:
+        # 创建数据库连接器
+        db_connector = DatabaseConnector()
+        
+        # 获取当前气象数据
+        print("获取当前气象数据...")
+        current_weather = db_connector.get_latest_weather_data()
+        
+        if current_weather is None or len(current_weather) == 0:
+            print("错误: 无法获取当前气象数据")
+            return
+        
+        # 创建预报系统
+        forecast_system = FutureRainfallForecastSystem()
+        
+        # 加载模型
+        print("加载预报模型...")
+        forecast_system.load_models(model_dir)
+        
+        if not forecast_system.forecast_models:
+            print("错误: 未能加载任何模型")
+            return
+        
+        # 进行预报
+        print("\n开始预测未来降雨情况...")
+        forecast_results = forecast_system.forecast(current_weather)
+        
+        # 可视化预报结果
+        print("\n生成预报可视化...")
+        visualization = Visualization(output_dir='realtime_forecast_results')
+        visualization.plot_forecast_results(forecast_results)
+        
+        # 输出预报报告
+        print("\n未来降雨预报摘要:")
+        print("=" * 50)
+        
+        current_time = datetime.now()
+        print(f"预报时间: {current_time.strftime('%Y-%m-%d %H:%M')}")
+        
+        for time_point, result in forecast_results.items():
+            print(f"\n未来{time_point}小时预报:")
+            print("-" * 30)
+            
+            rain_prob = result['rain_probability'].mean()
+            rain_predicted = sum(result['rain_binary']) > 0
+            
+            print(f"降雨概率: {rain_prob:.2%}")
+            print(f"预计降雨: {'有' if rain_predicted else '无'}")
+            
+            if rain_predicted:
+                rain_amount = result['rainfall_amount'].mean()
+                print(f"预计降雨量: {rain_amount:.2f}mm")
+                
+                # 根据降雨量给出建议
+                if rain_amount >= 25:  # 大雨
+                    print("建议: 避免外出，注意防范强降雨可能带来的积水和交通影响")
+                elif rain_amount >= 10:  # 中雨
+                    print("建议: 出门请携带雨具，道路可能湿滑，驾车注意安全")
+                else:  # 小雨
+                    print("建议: 带好雨具，不会对出行造成太大影响")
+        
+        print("\n预报完成")
+        
+    except Exception as e:
+        print(f"运行实时预报时出错: {str(e)}")
+    finally:
+        if 'db_connector' in locals():
+            db_connector.disconnect()
+
+def main():
+    parser = argparse.ArgumentParser(description="未来降雨预报系统")
+    parser.add_argument('--mode', type=str, default='train',
+                      choices=['train', 'predict', 'realtime'],
+                      help='执行模式: train-训练模型, predict-单次预测, realtime-实时预测')
+    parser.add_argument('--data', type=str, default='rainfall_data_time.csv',
+                      help='数据文件路径')
+    parser.add_argument('--input', type=str,
+                      help='单次预测的输入数据文件路径')
+    parser.add_argument('--model_dir', type=str, default='rainfall_forecast_models',
+                      help='模型保存/加载目录')
+    parser.add_argument('--n_features', type=int, default=35,
+                      help='使用的特征数量')
+    parser.add_argument('--clf_model', type=str, default='ensemble',
+                      choices=['lr', 'rf', 'gbdt', 'xgboost', 'ensemble'],
+                      help='分类器类型')
+    parser.add_argument('--reg_model', type=str, default='gbdt',
+                      choices=['rf', 'gbdt', 'ridge', 'xgboost'],
+                      help='回归器类型')
+    parser.add_argument('--time_based', action='store_true',
+                      help='是否使用基于时间的数据分割')
+    
+    args = parser.parse_args()
+    
+    if args.mode == 'train':
+        train_models(args.data, args.model_dir, args.n_features,
+                    args.clf_model, args.reg_model, args.time_based)
+    elif args.mode == 'predict':
+        if not args.input:
+            print("错误: 单次预测模式需要指定输入数据文件 (--input)")
+            return
+        run_single_prediction(args.input, args.model_dir)
+    elif args.mode == 'realtime':
+        run_realtime_forecast(args.model_dir)
+    else:
+        print(f"不支持的模式: {args.mode}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/models/rainfall_model.py b/src/models/rainfall_model.py
new file mode 100644
index 0000000..c83c350
--- /dev/null
+++ b/src/models/rainfall_model.py
@@ -0,0 +1,176 @@
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
+import xgboost as xgb
+from sklearn.ensemble import VotingClassifier
+from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+from sklearn.linear_model import Ridge
+import joblib
+import os
+
+class RainfallTwoStageModel:
+    def __init__(self, debug=True, clf_model_type='ensemble', reg_model_type='gbdt', output_dir='figures'):
+        """
+        初始化两阶段降雨预测模型
+        """
+        self.debug = debug
+        self.clf_model_type = clf_model_type
+        self.reg_model_type = reg_model_type
+        self.output_dir = output_dir
+        self.binary_classifier = None
+        self.regressor = None
+        self.category_classifier = None
+        self.threshold = 0.5
+
+    def build_binary_classifier(self):
+        """
+        构建二分类器
+        """
+        if self.clf_model_type == 'ensemble':
+            # 创建多个基础分类器
+            lr = LogisticRegression(max_iter=1000, random_state=42)
+            rf = RandomForestClassifier(n_estimators=100, random_state=42)
+            gbdt = GradientBoostingClassifier(n_estimators=100, random_state=42)
+            xgb_clf = xgb.XGBClassifier(n_estimators=100, random_state=42)
+            
+            # 创建投票分类器
+            self.binary_classifier = VotingClassifier(
+                estimators=[
+                    ('lr', lr),
+                    ('rf', rf),
+                    ('gbdt', gbdt),
+                    ('xgb', xgb_clf)
+                ],
+                voting='soft'
+            )
+        elif self.clf_model_type == 'lr':
+            self.binary_classifier = LogisticRegression(max_iter=1000, random_state=42)
+        elif self.clf_model_type == 'rf':
+            self.binary_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
+        elif self.clf_model_type == 'gbdt':
+            self.binary_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
+        elif self.clf_model_type == 'xgboost':
+            self.binary_classifier = xgb.XGBClassifier(n_estimators=100, random_state=42)
+        else:
+            raise ValueError(f"不支持的分类器类型: {self.clf_model_type}")
+
+    def build_regressor(self):
+        """
+        构建回归器
+        """
+        if self.reg_model_type == 'rf':
+            self.regressor = RandomForestRegressor(n_estimators=100, random_state=42)
+        elif self.reg_model_type == 'gbdt':
+            self.regressor = GradientBoostingRegressor(n_estimators=100, random_state=42)
+        elif self.reg_model_type == 'ridge':
+            self.regressor = Ridge(alpha=1.0)
+        elif self.reg_model_type == 'xgboost':
+            self.regressor = xgb.XGBRegressor(n_estimators=100, random_state=42)
+        else:
+            raise ValueError(f"不支持的回归器类型: {self.reg_model_type}")
+
+    def train_binary_classifier(self, X_train, y_train, X_val=None, y_val=None):
+        """
+        训练二分类器
+        """
+        if self.binary_classifier is None:
+            self.build_binary_classifier()
+        
+        self.binary_classifier.fit(X_train, y_train)
+        
+        # 优化阈值
+        if X_val is not None and y_val is not None:
+            self.optimize_threshold(X_val, y_val)
+
+    def train_regressor(self, X_train, y_train, rain_mask):
+        """
+        训练回归器
+        """
+        if self.regressor is None:
+            self.build_regressor()
+        
+        # 只使用有雨的样本训练回归器
+        X_train_rain = X_train[rain_mask]
+        y_train_rain = y_train[rain_mask]
+        
+        if len(X_train_rain) > 0:
+            self.regressor.fit(X_train_rain, y_train_rain)
+
+    def optimize_threshold(self, X_val, y_val):
+        """
+        优化分类阈值
+        """
+        from sklearn.metrics import f1_score
+        
+        # 获取预测概率
+        y_pred_proba = self.binary_classifier.predict_proba(X_val)[:, 1]
+        
+        # 尝试不同的阈值
+        thresholds = np.arange(0.1, 1.0, 0.1)
+        best_f1 = 0
+        best_threshold = 0.5
+        
+        for threshold in thresholds:
+            y_pred = (y_pred_proba >= threshold).astype(int)
+            f1 = f1_score(y_val, y_pred)
+            
+            if f1 > best_f1:
+                best_f1 = f1
+                best_threshold = threshold
+        
+        self.threshold = best_threshold
+
+    def predict(self, X):
+        """
+        预测降雨情况
+        """
+        # 二分类预测
+        y_pred_proba = self.binary_classifier.predict_proba(X)[:, 1]
+        rain_mask = y_pred_proba >= self.threshold
+        
+        # 回归预测
+        y_pred_amount = np.zeros(len(X))
+        if self.regressor is not None and any(rain_mask):
+            y_pred_amount[rain_mask] = self.regressor.predict(X[rain_mask])
+        
+        return {
+            'rain_probability': y_pred_proba,
+            'rain_binary': rain_mask.astype(int),
+            'rainfall_amount': y_pred_amount
+        }
+
+    def save_model(self, filename='rainfall_two_stage_model.joblib'):
+        """
+        保存模型
+        """
+        model_data = {
+            'binary_classifier': self.binary_classifier,
+            'regressor': self.regressor,
+            'threshold': self.threshold,
+            'clf_model_type': self.clf_model_type,
+            'reg_model_type': self.reg_model_type
+        }
+        
+        joblib.dump(model_data, filename)
+
+    @classmethod
+    def load_model(cls, filename='rainfall_two_stage_model.joblib'):
+        """
+        加载模型
+        """
+        if not os.path.exists(filename):
+            raise FileNotFoundError(f"模型文件不存在: {filename}")
+        
+        model_data = joblib.load(filename)
+        
+        model = cls(
+            clf_model_type=model_data['clf_model_type'],
+            reg_model_type=model_data['reg_model_type']
+        )
+        
+        model.binary_classifier = model_data['binary_classifier']
+        model.regressor = model_data['regressor']
+        model.threshold = model_data['threshold']
+        
+        return model
\ No newline at end of file
diff --git a/src/utils/feature_engineering.py b/src/utils/feature_engineering.py
new file mode 100644
index 0000000..14edb45
--- /dev/null
+++ b/src/utils/feature_engineering.py
@@ -0,0 +1,129 @@
+import numpy as np
+import pandas as pd
+
+class FeatureEngineering:
+    @staticmethod
+    def calculate_time_features(df):
+        """
+        计算时间相关特征
+        """
+        df['hour'] = df['datetime'].dt.hour
+        df['day'] = df['datetime'].dt.day
+        df['month'] = df['datetime'].dt.month
+        df['day_of_week'] = df['datetime'].dt.dayofweek
+        return df
+
+    @staticmethod
+    def calculate_change_features(df, variables):
+        """
+        计算变化率特征
+        """
+        for var in variables:
+            # 1小时变化
+            df[f'{var}_1h_change'] = df[var].diff(1)
+            
+            # 3小时变化
+            df[f'{var}_3h_change'] = df[var].diff(3)
+            
+            # 6小时变化
+            df[f'{var}_6h_change'] = df[var].diff(6)
+            
+            # 计算滚动均值
+            df[f'{var}_3h_mean'] = df[var].rolling(window=3, min_periods=1).mean()
+            df[f'{var}_6h_mean'] = df[var].rolling(window=6, min_periods=1).mean()
+            
+            # 滚动标准差
+            df[f'{var}_6h_std'] = df[var].rolling(window=6, min_periods=1).std().fillna(0)
+            
+            # 异常值
+            var_mean = df[var].mean()
+            var_std = df[var].std()
+            df[f'{var}_anomaly'] = (df[var] - var_mean) / var_std if var_std > 0 else 0
+            
+            # 急剧变化指标
+            var_std = df[f'{var}_1h_change'].std()
+            if var_std > 0:
+                df[f'{var}_rapid_change'] = (np.abs(df[f'{var}_1h_change']) > (2 * var_std)).astype(int)
+            else:
+                df[f'{var}_rapid_change'] = 0
+            
+            # 变化加速度
+            df[f'{var}_change_accel'] = df[f'{var}_1h_change'].diff(1).fillna(0)
+        
+        return df
+
+    @staticmethod
+    def calculate_combined_features(df):
+        """
+        计算组合特征
+        """
+        # PWV和湿度相关特征
+        if all(var in df.columns for var in ['PWV', 'rh']):
+            df['PWV_rh'] = df['PWV'] * df['rh'] / 100
+            df['PWV_cumul_change'] = df['PWV'].diff(1).rolling(window=6).sum().fillna(0)
+            df['rh_cumul_change'] = df['rh'].diff(1).rolling(window=6).sum().fillna(0)
+
+        # PWV和温度相关特征
+        if all(var in df.columns for var in ['PWV', 'Temp']):
+            df['PWV_Temp'] = df['PWV'] * df['Temp']
+            temp_exp = np.exp(17.27 * (df['Temp']-273.15) / ((df['Temp']-273.15) + 237.3))
+            df['saturation_index'] = df['PWV'] / (0.1 * temp_exp)
+
+        # 气压相关特征
+        if all(var in df.columns for var in ['Press', 'Press_1h_change']):
+            df['Press_change_intensity'] = df['Press_1h_change'].abs()
+            df['Press_falling'] = (df['Press_1h_change'] < 0).astype(int)
+            press_quantile = df['Press_1h_change'].abs().quantile(0.75)
+            df['Press_rapid_fall'] = ((df['Press_1h_change'] < 0) & 
+                                    (df['Press_1h_change'].abs() > press_quantile)).astype(int)
+
+        # ZTD和PWV相关特征
+        if all(var in df.columns for var in ['ZTD', 'PWV']):
+            df['ZTD_PWV_ratio'] = df['ZTD'] / df['PWV'].replace(0, np.nan).fillna(df['PWV'].mean())
+            if all(var in df.columns for var in ['ZTD_1h_change', 'PWV_1h_change']):
+                df['ZTD_PWV_change'] = df['ZTD_1h_change'] * df['PWV_1h_change']
+            
+            pwv_mean = df['PWV_6h_mean'].mean()
+            if pwv_mean > 0:
+                df['moisture_instability'] = df['PWV_6h_std'] / df['PWV_6h_mean']
+            else:
+                df['moisture_instability'] = 0
+
+        # 降雨潜力指标
+        if all(var in df.columns for var in ['PWV', 'rh', 'Temp', 'Press_1h_change']):
+            df['rain_potential'] = ((df['PWV'] * df['rh'] / 100) * 
+                                  (0.1 - df['Press_1h_change']) / 
+                                  (df['Temp'] + 10))
+            
+            pwv_mean = df['PWV'].mean() + 0.1
+            df['heavy_rain_potential'] = (df['PWV'] * (df['rh'] / 100) * 
+                                        (df['Press_rapid_fall'] + 1) * 
+                                        (1 + np.abs(df['PWV_1h_change']) / 
+                                         pwv_mean))
+            
+            df['rain_persistence'] = (df['PWV_cumul_change'] * 
+                                    df['rh_cumul_change'] * 
+                                    (1 + df['Press_change_intensity']))
+
+        return df
+
+    @staticmethod
+    def process_features(df):
+        """
+        处理所有特征
+        """
+        # 确保datetime列是datetime类型
+        if 'datetime' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+            df['datetime'] = pd.to_datetime(df['datetime'])
+
+        # 计算时间特征
+        df = FeatureEngineering.calculate_time_features(df)
+
+        # 计算变化率特征
+        variables = ['PWV', 'ZTD', 'Temp', 'Press', 'rh']
+        df = FeatureEngineering.calculate_change_features(df, variables)
+
+        # 计算组合特征
+        df = FeatureEngineering.calculate_combined_features(df)
+
+        return df
\ No newline at end of file
diff --git a/src/utils/visualization.py b/src/utils/visualization.py
new file mode 100644
index 0000000..7beadec
--- /dev/null
+++ b/src/utils/visualization.py
@@ -0,0 +1,113 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import numpy as np
+import os
+
+class Visualization:
+    def __init__(self, output_dir='figures'):
+        """
+        初始化可视化工具
+        """
+        self.output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+    def plot_feature_importance(self, feature_importance, title='特征重要性'):
+        """
+        绘制特征重要性图
+        """
+        plt.figure(figsize=(12, 6))
+        feature_importance.sort_values().plot(kind='barh')
+        plt.title(title)
+        plt.xlabel('重要性')
+        plt.tight_layout()
+        plt.savefig(os.path.join(self.output_dir, 'feature_importance.png'))
+        plt.close()
+
+    def plot_rainfall_distribution(self, data, title='降雨量分布'):
+        """
+        绘制降雨量分布图
+        """
+        plt.figure(figsize=(10, 6))
+        sns.histplot(data=data, x='rainfall', bins=50)
+        plt.title(title)
+        plt.xlabel('降雨量 (mm)')
+        plt.ylabel('频次')
+        plt.savefig(os.path.join(self.output_dir, 'rainfall_distribution.png'))
+        plt.close()
+
+    def plot_time_series(self, data, variables, title='时间序列数据'):
+        """
+        绘制时间序列图
+        """
+        plt.figure(figsize=(15, 8))
+        for var in variables:
+            plt.plot(data['datetime'], data[var], label=var)
+        plt.title(title)
+        plt.xlabel('时间')
+        plt.ylabel('值')
+        plt.legend()
+        plt.grid(True)
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+        plt.savefig(os.path.join(self.output_dir, 'time_series.png'))
+        plt.close()
+
+    def plot_correlation_matrix(self, data, title='特征相关性矩阵'):
+        """
+        绘制相关性矩阵热力图
+        """
+        plt.figure(figsize=(12, 10))
+        correlation_matrix = data.corr()
+        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
+        plt.title(title)
+        plt.tight_layout()
+        plt.savefig(os.path.join(self.output_dir, 'correlation_matrix.png'))
+        plt.close()
+
+    def plot_forecast_results(self, forecast_results, title='降雨预报结果'):
+        """
+        绘制预报结果图
+        """
+        time_points = sorted(forecast_results.keys())
+        rain_probs = [results['rain_probability'].mean() for results in forecast_results.values()]
+        rain_amounts = [results['rainfall_amount'].mean() for results in forecast_results.values()]
+
+        # 创建子图
+        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
+
+        # 绘制降雨概率
+        ax1.plot(time_points, rain_probs, 'b-', label='降雨概率')
+        ax1.fill_between(time_points, 0, rain_probs, alpha=0.2)
+        ax1.set_xlabel('预报时间（小时）')
+        ax1.set_ylabel('降雨概率')
+        ax1.set_title('未来降雨概率预报')
+        ax1.grid(True)
+        ax1.legend()
+
+        # 绘制降雨量
+        ax2.plot(time_points, rain_amounts, 'r-', label='预计降雨量')
+        ax2.fill_between(time_points, 0, rain_amounts, alpha=0.2)
+        ax2.set_xlabel('预报时间（小时）')
+        ax2.set_ylabel('降雨量 (mm)')
+        ax2.set_title('未来降雨量预报')
+        ax2.grid(True)
+        ax2.legend()
+
+        plt.tight_layout()
+        plt.savefig(os.path.join(self.output_dir, 'forecast_results.png'))
+        plt.close()
+
+    def plot_model_evaluation(self, y_true, y_pred, title='模型评估'):
+        """
+        绘制模型评估图
+        """
+        plt.figure(figsize=(10, 6))
+        plt.scatter(y_true, y_pred, alpha=0.5)
+        plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--')
+        plt.xlabel('实际值')
+        plt.ylabel('预测值')
+        plt.title(title)
+        plt.grid(True)
+        plt.savefig(os.path.join(self.output_dir, 'model_evaluation.png'))
+        plt.close()
\ No newline at end of file