first commit

2025-05-21 11:33:39 +08:00 · 2025-05-21 11:33:39 +08:00 · c3962e21b9
commit c3962e21b9
9 changed files with 4044 additions and 0 deletions
--- a/rain_forcast_3h.py
+++ b/rain_forcast_3h.py
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,8 @@
+numpy>=1.21.0
+pandas>=1.3.0
+scikit-learn>=0.24.2
+xgboost>=1.4.2
+matplotlib>=3.4.2
+seaborn>=0.11.1
+pymysql>=1.0.2
+joblib>=1.0.1
--- a/src/data/data_processor.py
+++ b/src/data/data_processor.py
@ -0,0 +1,142 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from ..utils.feature_engineering import FeatureEngineering
+
+class DataProcessor:
+    def __init__(self, debug=True):
+        """
+        初始化数据处理器
+        """
+        self.debug = debug
+        self.scaler = StandardScaler()
+        self.feature_columns = None
+        self.target_columns = None
+
+    def debug_print(self, message):
+        """
+        调试信息打印
+        """
+        if self.debug:
+            print(message)
+
+    def preprocess_data(self, data, silent=False):
+        """
+        数据预处理
+        """
+        if not silent:
+            self.debug_print("开始数据预处理...")
+
+        # 复制数据以避免修改原始数据
+        df = data.copy()
+
+        # 确保datetime列是datetime类型
+        if 'datetime' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+            df['datetime'] = pd.to_datetime(df['datetime'])
+
+        # 处理缺失值
+        df = self._handle_missing_values(df)
+
+        # 特征工程
+        df = FeatureEngineering.process_features(df)
+
+        # 移除包含NaN的行
+        df = df.dropna()
+
+        if not silent:
+            self.debug_print(f"预处理完成，数据形状: {df.shape}")
+
+        return df
+
+    def _handle_missing_values(self, df):
+        """
+        处理缺失值
+        """
+        # 对于时间序列数据，使用前向填充
+        df = df.fillna(method='ffill')
+        
+        # 对于剩余的缺失值，使用列的中位数填充
+        df = df.fillna(df.median())
+        
+        return df
+
+    def select_features(self, data, n_features=30, silent=False):
+        """
+        特征选择
+        """
+        if not silent:
+            self.debug_print(f"选择前{n_features}个特征...")
+
+        # 计算特征重要性
+        feature_importance = self._calculate_feature_importance(data)
+        
+        # 选择最重要的特征
+        selected_features = feature_importance.nlargest(n_features).index.tolist()
+        
+        if not silent:
+            self.debug_print(f"选择的特征: {selected_features}")
+
+        return selected_features
+
+    def _calculate_feature_importance(self, data):
+        """
+        计算特征重要性
+        """
+        # 这里使用一个简单的基于相关性的特征重要性计算
+        # 在实际应用中，你可能需要使用更复杂的方法
+        correlations = data.corr()['rainfall'].abs()
+        return correlations.drop('rainfall')
+
+    def split_data(self, X, y_binary, y_amount, y_category=None, time_based=False, test_size=0.2):
+        """
+        数据分割
+        """
+        if time_based:
+            # 基于时间的分割
+            split_idx = int(len(X) * (1 - test_size))
+            X_train = X[:split_idx]
+            X_test = X[split_idx:]
+            y_train_binary = y_binary[:split_idx]
+            y_test_binary = y_binary[split_idx:]
+            y_train_amount = y_amount[:split_idx]
+            y_test_amount = y_amount[split_idx:]
+            
+            if y_category is not None:
+                y_train_category = y_category[:split_idx]
+                y_test_category = y_category[split_idx:]
+            else:
+                y_train_category = None
+                y_test_category = None
+        else:
+            # 随机分割
+            from sklearn.model_selection import train_test_split
+            X_train, X_test, y_train_binary, y_test_binary, y_train_amount, y_test_amount = train_test_split(
+                X, y_binary, y_amount, test_size=test_size, random_state=42
+            )
+            
+            if y_category is not None:
+                _, _, y_train_category, y_test_category = train_test_split(
+                    X, y_category, test_size=test_size, random_state=42
+                )
+            else:
+                y_train_category = None
+                y_test_category = None
+
+        return X_train, X_test, y_train_binary, y_test_binary, y_train_amount, y_test_amount, y_train_category, y_test_category
+
+    def scale_features(self, X_train, X_test):
+        """
+        特征缩放
+        """
+        # 保存特征列名
+        self.feature_columns = X_train.columns
+        
+        # 缩放特征
+        X_train_scaled = self.scaler.fit_transform(X_train)
+        X_test_scaled = self.scaler.transform(X_test)
+        
+        # 转换回DataFrame
+        X_train_scaled = pd.DataFrame(X_train_scaled, columns=self.feature_columns)
+        X_test_scaled = pd.DataFrame(X_test_scaled, columns=self.feature_columns)
+        
+        return X_train_scaled, X_test_scaled
--- a/src/database/db_connector.py
+++ b/src/database/db_connector.py
@ -0,0 +1,236 @@
+import pandas as pd
+import pymysql
+from datetime import datetime, timedelta
+
+class DatabaseConnector:
+    def __init__(self, rtk_host='localhost', rtk_user='root', rtk_password='root', rtk_database='rtk_data',
+                 rain_host='8.134.185.53', rain_user='remote', rain_password='root', rain_database='rain_db'):
+        """
+        初始化数据库连接器
+        """
+        # RTK数据库配置
+        self.rtk_host = 'localhost'
+        self.rtk_user = 'root'
+        self.rtk_password = 'root'
+        self.rtk_database = 'rtk_data'
+        self.rtk_connection = None
+
+        # 气象数据库配置
+        self.rain_host = '8.134.185.53'
+        self.rain_user = 'remote'
+        self.rain_password = 'root'
+        self.rain_database = 'rain_db'
+        self.rain_connection = None
+
+    def connect_rtk(self):
+        """
+        建立RTK数据库连接
+        """
+        try:
+            self.rtk_connection = pymysql.connect(
+                host=self.rtk_host,
+                user=self.rtk_user,
+                password=self.rtk_password,
+                database=self.rtk_database
+            )
+            return True
+        except Exception as e:
+            print(f"RTK数据库连接失败: {str(e)}")
+            return False
+
+    def connect_rain(self):
+        """
+        建立气象数据库连接
+        """
+        try:
+            self.rain_connection = pymysql.connect(
+                host=self.rain_host,
+                user=self.rain_user,
+                password=self.rain_password,
+                database=self.rain_database
+            )
+            return True
+        except Exception as e:
+            print(f"气象数据库连接失败: {str(e)}")
+            return False
+
+    def disconnect(self):
+        """
+        关闭所有数据库连接
+        """
+        if self.rtk_connection:
+            self.rtk_connection.close()
+            self.rtk_connection = None
+        if self.rain_connection:
+            self.rain_connection.close()
+            self.rain_connection = None
+
+    def get_latest_weather_data(self):
+        """
+        获取最新的气象数据，合并RTK和气象数据的最新记录
+        使用RTK数据的时间作为基准时间
+        """
+        # 获取RTK数据
+        if not self.rtk_connection:
+            if not self.connect_rtk():
+                return None
+
+        try:
+            # 获取RTK数据最新记录
+            rtk_query = """
+                SELECT 
+                    timestamp,
+                    pwv,
+                    ztd,
+                    ztd_gradient_east,
+                    ztd_gradient_north
+                FROM rtk_data
+                ORDER BY timestamp DESC
+                LIMIT 1
+            """
+            rtk_df = pd.read_sql(rtk_query, self.rtk_connection)
+            
+            if rtk_df.empty:
+                print("RTK数据库中没有数据")
+                return None
+
+            # 获取气象数据最新记录
+            if not self.rain_connection:
+                if not self.connect_rain():
+                    print("无法连接到气象数据库")
+                    return rtk_df
+
+            rain_query = """
+                SELECT 
+                    wind_speed,
+                    wind_force,
+                    wind_direction_360 as wind_direction,
+                    humidity,
+                    temperature,
+                    atm_pressure,
+                    solar_radiation,
+                    rainfall
+                FROM sensor_data
+                ORDER BY timestamp DESC
+                LIMIT 1
+            """
+            rain_df = pd.read_sql(rain_query, self.rain_connection)
+            
+            if rain_df.empty:
+                print("气象数据库中没有数据")
+                return rtk_df
+
+            # 重命名RTK数据列
+            rtk_df = rtk_df.rename(columns={
+                'timestamp': 'datetime',
+                'pwv': 'PWV',
+                'ztd': 'ZTD'
+            })
+
+            # 重命名气象数据列
+            rain_df = rain_df.rename(columns={
+                'humidity': 'rh',
+                'temperature': 'Temp',
+                'atm_pressure': 'Press',
+                'wind_speed': 'ws'
+            })
+
+            # 将RTK的时间戳转换为datetime类型
+            rtk_df['datetime'] = pd.to_datetime(rtk_df['datetime'])
+
+            # 合并数据，使用RTK的时间
+            merged_df = pd.concat([rtk_df, rain_df], axis=1)
+            
+            # 删除重复的列（如果有的话）
+            merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]
+            
+            return merged_df
+
+        except Exception as e:
+            print(f"获取数据失败: {str(e)}")
+            return None
+
+    def get_historical_data(self, start_date, end_date):
+        """
+        获取历史数据，合并RTK和气象数据
+        """
+        # 获取RTK数据
+        if not self.rtk_connection:
+            if not self.connect_rtk():
+                return None
+
+        try:
+            # 获取RTK数据
+            rtk_query = """
+                SELECT 
+                    timestamp,
+                    pwv,
+                    ztd,
+                    ztd_gradient_east,
+                    ztd_gradient_north
+                FROM rtk_data
+                WHERE timestamp BETWEEN %s AND %s
+                ORDER BY timestamp
+            """
+            rtk_df = pd.read_sql(rtk_query, self.rtk_connection, params=[start_date, end_date])
+            
+            # 重命名RTK数据列
+            rtk_df = rtk_df.rename(columns={
+                'timestamp': 'datetime',
+                'pwv': 'PWV',
+                'ztd': 'ZTD'
+            })
+
+            # 获取气象数据
+            if not self.rain_connection:
+                if not self.connect_rain():
+                    return rtk_df  # 如果气象数据库连接失败，返回RTK数据
+
+            rain_query = """
+                SELECT 
+                    timestamp,
+                    wind_speed,
+                    wind_force,
+                    wind_direction_360 as wind_direction,
+                    humidity,
+                    temperature,
+                    atm_pressure,
+                    solar_radiation,
+                    rainfall
+                FROM sensor_data
+                WHERE timestamp BETWEEN %s AND %s
+                ORDER BY timestamp
+            """
+            rain_df = pd.read_sql(rain_query, self.rain_connection, params=[start_date, end_date])
+            
+            # 重命名气象数据列
+            rain_df = rain_df.rename(columns={
+                'timestamp': 'datetime',
+                'humidity': 'rh',
+                'temperature': 'Temp',
+                'atm_pressure': 'Press',
+                'wind_speed': 'ws'
+            })
+
+            # 合并数据
+            if not rain_df.empty:
+                # 将时间戳转换为datetime类型
+                rtk_df['datetime'] = pd.to_datetime(rtk_df['datetime'])
+                rain_df['datetime'] = pd.to_datetime(rain_df['datetime'])
+                
+                # 合并数据
+                merged_df = pd.merge(rtk_df, rain_df, on='datetime', how='outer')
+                
+                # 按时间排序
+                merged_df = merged_df.sort_values('datetime')
+                
+                # 填充缺失值
+                merged_df = merged_df.fillna(method='ffill')
+                
+                return merged_df
+            else:
+                return rtk_df
+
+        except Exception as e:
+            print(f"获取历史数据失败: {str(e)}")
+            return None
--- a/src/forecast/forecast_system.py
+++ b/src/forecast/forecast_system.py
@ -0,0 +1,154 @@
+import os
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+from ..models.rainfall_model import RainfallTwoStageModel
+from ..data.data_processor import DataProcessor
+from ..utils.feature_engineering import FeatureEngineering
+
+class FutureRainfallForecastSystem:
+    def __init__(self, model_path=None, debug=False):
+        """
+        初始化未来降雨预报系统
+        """
+        self.debug = debug
+        self.forecast_models = {}
+        self.data_processor = DataProcessor(debug=debug)
+        self.model_path = model_path
+
+    def train_forecast_models(self, data, features_config=None):
+        """
+        训练多个时间点的预测模型
+        """
+        print("开始训练未来降雨预报模型...")
+        
+        # 预处理数据
+        processed_data = self.data_processor.preprocess_data(data)
+        
+        # 为每个预测时间点训练模型
+        for time_point in [1, 3, 6, 12, 24]:  # 预测未来1, 3, 6, 12, 24小时
+            print(f"\n训练{time_point}小时预报模型...")
+            
+            # 准备训练数据
+            future_data = self._prepare_future_data(processed_data, time_point)
+            
+            # 创建和训练模型
+            model = RainfallTwoStageModel(debug=self.debug)
+            model.train_and_evaluate(future_data)
+            
+            # 保存模型
+            self.forecast_models[time_point] = model
+            
+            print(f"{time_point}小时预报模型训练完成")
+
+    def _prepare_future_data(self, data, time_point):
+        """
+        准备未来时间点的训练数据
+        """
+        # 创建时间偏移
+        future_data = data.copy()
+        future_data['datetime'] = future_data['datetime'] + timedelta(hours=time_point)
+        
+        # 合并当前数据和未来数据
+        merged_data = pd.merge(
+            data,
+            future_data[['datetime', 'rainfall']],
+            on='datetime',
+            suffixes=('', '_future')
+        )
+        
+        # 重命名目标变量
+        merged_data = merged_data.rename(columns={'rainfall_future': 'rainfall'})
+        
+        return merged_data
+
+    def forecast(self, current_data):
+        """
+        进行未来降雨预报
+        """
+        if not self.forecast_models:
+            raise ValueError("没有可用的预报模型，请先训练模型")
+
+        # 预处理当前数据
+        processed_data = self.data_processor.preprocess_data(current_data)
+        
+        # 存储预报结果
+        forecast_results = {}
+        
+        # 对每个时间点进行预报
+        for time_point, model in self.forecast_models.items():
+            # 准备预报数据
+            forecast_data = self._prepare_future_data(processed_data, time_point)
+            
+            # 进行预报
+            predictions = model.predict(forecast_data)
+            
+            # 存储结果
+            forecast_results[time_point] = predictions
+        
+        return forecast_results
+
+    def load_models(self, base_path):
+        """
+        加载预报模型
+        """
+        if not os.path.exists(base_path):
+            raise FileNotFoundError(f"模型目录不存在: {base_path}")
+        
+        for time_point in [1, 3, 6, 12, 24]:
+            model_path = os.path.join(base_path, f'model_{time_point}h.joblib')
+            if os.path.exists(model_path):
+                try:
+                    model = RainfallTwoStageModel.load_model(model_path)
+                    self.forecast_models[time_point] = model
+                    print(f"成功加载{time_point}小时预报模型")
+                except Exception as e:
+                    print(f"加载{time_point}小时预报模型失败: {str(e)}")
+
+    def save_models(self, base_path):
+        """
+        保存预报模型
+        """
+        os.makedirs(base_path, exist_ok=True)
+        
+        for time_point, model in self.forecast_models.items():
+            model_path = os.path.join(base_path, f'model_{time_point}h.joblib')
+            model.save_model(model_path)
+            print(f"保存{time_point}小时预报模型到: {model_path}")
+
+    def visualize_forecast(self, forecast_results, output_dir='forecast_visualizations'):
+        """
+        可视化预报结果
+        """
+        import matplotlib.pyplot as plt
+        
+        os.makedirs(output_dir, exist_ok=True)
+        
+        # 创建预报时间序列
+        time_points = sorted(forecast_results.keys())
+        rain_probs = [results['rain_probability'].mean() for results in forecast_results.values()]
+        rain_amounts = [results['rainfall_amount'].mean() for results in forecast_results.values()]
+        
+        # 绘制降雨概率
+        plt.figure(figsize=(10, 6))
+        plt.plot(time_points, rain_probs, 'b-', label='降雨概率')
+        plt.fill_between(time_points, 0, rain_probs, alpha=0.2)
+        plt.xlabel('预报时间（小时）')
+        plt.ylabel('降雨概率')
+        plt.title('未来降雨概率预报')
+        plt.grid(True)
+        plt.legend()
+        plt.savefig(os.path.join(output_dir, 'rain_probability.png'))
+        plt.close()
+        
+        # 绘制降雨量
+        plt.figure(figsize=(10, 6))
+        plt.plot(time_points, rain_amounts, 'r-', label='预计降雨量')
+        plt.fill_between(time_points, 0, rain_amounts, alpha=0.2)
+        plt.xlabel('预报时间（小时）')
+        plt.ylabel('降雨量 (mm)')
+        plt.title('未来降雨量预报')
+        plt.grid(True)
+        plt.legend()
+        plt.savefig(os.path.join(output_dir, 'rainfall_amount.png'))
+        plt.close()
--- a/src/main.py
+++ b/src/main.py
@ -0,0 +1,232 @@
+import argparse
+import os
+from datetime import datetime
+import pandas as pd
+from database.db_connector import DatabaseConnector
+from data.data_processor import DataProcessor
+from models.rainfall_model import RainfallTwoStageModel
+from forecast.forecast_system import FutureRainfallForecastSystem
+from utils.visualization import Visualization
+
+def train_models(data_file, model_dir, n_features=35, clf_model='ensemble', reg_model='gbdt', time_based=True):
+    """
+    训练模型
+    """
+    print(f"加载训练数据: {data_file}")
+    try:
+        # 读取历史数据
+        historical_data = pd.read_csv(data_file)
+        
+        # 确保时间列是日期时间格式
+        if 'datetime' in historical_data.columns:
+            historical_data['datetime'] = pd.to_datetime(historical_data['datetime'])
+        
+        # 创建预报系统
+        forecast_system = FutureRainfallForecastSystem()
+        
+        # 训练模型
+        forecast_system.train_forecast_models(historical_data)
+        
+        # 保存模型
+        forecast_system.save_models(model_dir)
+        
+        print("模型训练完成")
+        
+    except Exception as e:
+        print(f"训练模型时出错: {str(e)}")
+
+def run_single_prediction(input_file, model_dir):
+    """
+    运行单次预测
+    """
+    print("启动单次预测...")
+    
+    # 检查模型目录是否存在
+    if not os.path.exists(model_dir):
+        print(f"错误: 未找到模型目录 '{model_dir}'")
+        print("请先使用历史数据训练模型")
+        return
+    
+    try:
+        # 读取输入数据
+        print(f"读取输入数据: {input_file}")
+        input_data = pd.read_csv(input_file)
+        
+        # 确保时间列是日期时间格式
+        if 'datetime' in input_data.columns:
+            input_data['datetime'] = pd.to_datetime(input_data['datetime'])
+        
+        # 创建预报系统
+        forecast_system = FutureRainfallForecastSystem()
+        
+        # 加载模型
+        print("加载预报模型...")
+        forecast_system.load_models(model_dir)
+        
+        if not forecast_system.forecast_models:
+            print("错误: 未能加载任何模型")
+            return
+        
+        # 进行预报
+        print("\n开始预测降雨情况...")
+        forecast_results = forecast_system.forecast(input_data)
+        
+        # 可视化预报结果
+        print("\n生成预报可视化...")
+        visualization = Visualization(output_dir='single_prediction_results')
+        visualization.plot_forecast_results(forecast_results)
+        
+        # 输出预报报告
+        print("\n降雨预报摘要:")
+        print("=" * 50)
+        
+        current_time = datetime.now()
+        print(f"预报时间: {current_time.strftime('%Y-%m-%d %H:%M')}")
+        
+        for time_point, result in forecast_results.items():
+            print(f"\n未来{time_point}小时预报:")
+            print("-" * 30)
+            
+            rain_prob = result['rain_probability'].mean()
+            rain_predicted = sum(result['rain_binary']) > 0
+            
+            print(f"降雨概率: {rain_prob:.2%}")
+            print(f"预计降雨: {'有' if rain_predicted else '无'}")
+            
+            if rain_predicted:
+                rain_amount = result['rainfall_amount'].mean()
+                print(f"预计降雨量: {rain_amount:.2f}mm")
+                
+                # 根据降雨量给出建议
+                if rain_amount >= 25:  # 大雨
+                    print("建议: 避免外出，注意防范强降雨可能带来的积水和交通影响")
+                elif rain_amount >= 10:  # 中雨
+                    print("建议: 出门请携带雨具，道路可能湿滑，驾车注意安全")
+                else:  # 小雨
+                    print("建议: 带好雨具，不会对出行造成太大影响")
+        
+        print("\n预测完成")
+        
+    except Exception as e:
+        print(f"运行单次预测时出错: {str(e)}")
+
+def run_realtime_forecast(model_dir):
+    """
+    运行实时预报
+    """
+    print("启动实时降雨预报系统...")
+    
+    # 检查模型目录是否存在
+    if not os.path.exists(model_dir):
+        print(f"错误: 未找到模型目录 '{model_dir}'")
+        print("请先使用历史数据训练模型")
+        return
+    
+    try:
+        # 创建数据库连接器
+        db_connector = DatabaseConnector()
+        
+        # 获取当前气象数据
+        print("获取当前气象数据...")
+        current_weather = db_connector.get_latest_weather_data()
+        
+        if current_weather is None or len(current_weather) == 0:
+            print("错误: 无法获取当前气象数据")
+            return
+        
+        # 创建预报系统
+        forecast_system = FutureRainfallForecastSystem()
+        
+        # 加载模型
+        print("加载预报模型...")
+        forecast_system.load_models(model_dir)
+        
+        if not forecast_system.forecast_models:
+            print("错误: 未能加载任何模型")
+            return
+        
+        # 进行预报
+        print("\n开始预测未来降雨情况...")
+        forecast_results = forecast_system.forecast(current_weather)
+        
+        # 可视化预报结果
+        print("\n生成预报可视化...")
+        visualization = Visualization(output_dir='realtime_forecast_results')
+        visualization.plot_forecast_results(forecast_results)
+        
+        # 输出预报报告
+        print("\n未来降雨预报摘要:")
+        print("=" * 50)
+        
+        current_time = datetime.now()
+        print(f"预报时间: {current_time.strftime('%Y-%m-%d %H:%M')}")
+        
+        for time_point, result in forecast_results.items():
+            print(f"\n未来{time_point}小时预报:")
+            print("-" * 30)
+            
+            rain_prob = result['rain_probability'].mean()
+            rain_predicted = sum(result['rain_binary']) > 0
+            
+            print(f"降雨概率: {rain_prob:.2%}")
+            print(f"预计降雨: {'有' if rain_predicted else '无'}")
+            
+            if rain_predicted:
+                rain_amount = result['rainfall_amount'].mean()
+                print(f"预计降雨量: {rain_amount:.2f}mm")
+                
+                # 根据降雨量给出建议
+                if rain_amount >= 25:  # 大雨
+                    print("建议: 避免外出，注意防范强降雨可能带来的积水和交通影响")
+                elif rain_amount >= 10:  # 中雨
+                    print("建议: 出门请携带雨具，道路可能湿滑，驾车注意安全")
+                else:  # 小雨
+                    print("建议: 带好雨具，不会对出行造成太大影响")
+        
+        print("\n预报完成")
+        
+    except Exception as e:
+        print(f"运行实时预报时出错: {str(e)}")
+    finally:
+        if 'db_connector' in locals():
+            db_connector.disconnect()
+
+def main():
+    parser = argparse.ArgumentParser(description="未来降雨预报系统")
+    parser.add_argument('--mode', type=str, default='train',
+                      choices=['train', 'predict', 'realtime'],
+                      help='执行模式: train-训练模型, predict-单次预测, realtime-实时预测')
+    parser.add_argument('--data', type=str, default='rainfall_data_time.csv',
+                      help='数据文件路径')
+    parser.add_argument('--input', type=str,
+                      help='单次预测的输入数据文件路径')
+    parser.add_argument('--model_dir', type=str, default='rainfall_forecast_models',
+                      help='模型保存/加载目录')
+    parser.add_argument('--n_features', type=int, default=35,
+                      help='使用的特征数量')
+    parser.add_argument('--clf_model', type=str, default='ensemble',
+                      choices=['lr', 'rf', 'gbdt', 'xgboost', 'ensemble'],
+                      help='分类器类型')
+    parser.add_argument('--reg_model', type=str, default='gbdt',
+                      choices=['rf', 'gbdt', 'ridge', 'xgboost'],
+                      help='回归器类型')
+    parser.add_argument('--time_based', action='store_true',
+                      help='是否使用基于时间的数据分割')
+    
+    args = parser.parse_args()
+    
+    if args.mode == 'train':
+        train_models(args.data, args.model_dir, args.n_features,
+                    args.clf_model, args.reg_model, args.time_based)
+    elif args.mode == 'predict':
+        if not args.input:
+            print("错误: 单次预测模式需要指定输入数据文件 (--input)")
+            return
+        run_single_prediction(args.input, args.model_dir)
+    elif args.mode == 'realtime':
+        run_realtime_forecast(args.model_dir)
+    else:
+        print(f"不支持的模式: {args.mode}")
+
+if __name__ == "__main__":
+    main()
--- a/src/models/rainfall_model.py
+++ b/src/models/rainfall_model.py
@ -0,0 +1,176 @@
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
+import xgboost as xgb
+from sklearn.ensemble import VotingClassifier
+from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+from sklearn.linear_model import Ridge
+import joblib
+import os
+
+class RainfallTwoStageModel:
+    def __init__(self, debug=True, clf_model_type='ensemble', reg_model_type='gbdt', output_dir='figures'):
+        """
+        初始化两阶段降雨预测模型
+        """
+        self.debug = debug
+        self.clf_model_type = clf_model_type
+        self.reg_model_type = reg_model_type
+        self.output_dir = output_dir
+        self.binary_classifier = None
+        self.regressor = None
+        self.category_classifier = None
+        self.threshold = 0.5
+
+    def build_binary_classifier(self):
+        """
+        构建二分类器
+        """
+        if self.clf_model_type == 'ensemble':
+            # 创建多个基础分类器
+            lr = LogisticRegression(max_iter=1000, random_state=42)
+            rf = RandomForestClassifier(n_estimators=100, random_state=42)
+            gbdt = GradientBoostingClassifier(n_estimators=100, random_state=42)
+            xgb_clf = xgb.XGBClassifier(n_estimators=100, random_state=42)
+            
+            # 创建投票分类器
+            self.binary_classifier = VotingClassifier(
+                estimators=[
+                    ('lr', lr),
+                    ('rf', rf),
+                    ('gbdt', gbdt),
+                    ('xgb', xgb_clf)
+                ],
+                voting='soft'
+            )
+        elif self.clf_model_type == 'lr':
+            self.binary_classifier = LogisticRegression(max_iter=1000, random_state=42)
+        elif self.clf_model_type == 'rf':
+            self.binary_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
+        elif self.clf_model_type == 'gbdt':
+            self.binary_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
+        elif self.clf_model_type == 'xgboost':
+            self.binary_classifier = xgb.XGBClassifier(n_estimators=100, random_state=42)
+        else:
+            raise ValueError(f"不支持的分类器类型: {self.clf_model_type}")
+
+    def build_regressor(self):
+        """
+        构建回归器
+        """
+        if self.reg_model_type == 'rf':
+            self.regressor = RandomForestRegressor(n_estimators=100, random_state=42)
+        elif self.reg_model_type == 'gbdt':
+            self.regressor = GradientBoostingRegressor(n_estimators=100, random_state=42)
+        elif self.reg_model_type == 'ridge':
+            self.regressor = Ridge(alpha=1.0)
+        elif self.reg_model_type == 'xgboost':
+            self.regressor = xgb.XGBRegressor(n_estimators=100, random_state=42)
+        else:
+            raise ValueError(f"不支持的回归器类型: {self.reg_model_type}")
+
+    def train_binary_classifier(self, X_train, y_train, X_val=None, y_val=None):
+        """
+        训练二分类器
+        """
+        if self.binary_classifier is None:
+            self.build_binary_classifier()
+        
+        self.binary_classifier.fit(X_train, y_train)
+        
+        # 优化阈值
+        if X_val is not None and y_val is not None:
+            self.optimize_threshold(X_val, y_val)
+
+    def train_regressor(self, X_train, y_train, rain_mask):
+        """
+        训练回归器
+        """
+        if self.regressor is None:
+            self.build_regressor()
+        
+        # 只使用有雨的样本训练回归器
+        X_train_rain = X_train[rain_mask]
+        y_train_rain = y_train[rain_mask]
+        
+        if len(X_train_rain) > 0:
+            self.regressor.fit(X_train_rain, y_train_rain)
+
+    def optimize_threshold(self, X_val, y_val):
+        """
+        优化分类阈值
+        """
+        from sklearn.metrics import f1_score
+        
+        # 获取预测概率
+        y_pred_proba = self.binary_classifier.predict_proba(X_val)[:, 1]
+        
+        # 尝试不同的阈值
+        thresholds = np.arange(0.1, 1.0, 0.1)
+        best_f1 = 0
+        best_threshold = 0.5
+        
+        for threshold in thresholds:
+            y_pred = (y_pred_proba >= threshold).astype(int)
+            f1 = f1_score(y_val, y_pred)
+            
+            if f1 > best_f1:
+                best_f1 = f1
+                best_threshold = threshold
+        
+        self.threshold = best_threshold
+
+    def predict(self, X):
+        """
+        预测降雨情况
+        """
+        # 二分类预测
+        y_pred_proba = self.binary_classifier.predict_proba(X)[:, 1]
+        rain_mask = y_pred_proba >= self.threshold
+        
+        # 回归预测
+        y_pred_amount = np.zeros(len(X))
+        if self.regressor is not None and any(rain_mask):
+            y_pred_amount[rain_mask] = self.regressor.predict(X[rain_mask])
+        
+        return {
+            'rain_probability': y_pred_proba,
+            'rain_binary': rain_mask.astype(int),
+            'rainfall_amount': y_pred_amount
+        }
+
+    def save_model(self, filename='rainfall_two_stage_model.joblib'):
+        """
+        保存模型
+        """
+        model_data = {
+            'binary_classifier': self.binary_classifier,
+            'regressor': self.regressor,
+            'threshold': self.threshold,
+            'clf_model_type': self.clf_model_type,
+            'reg_model_type': self.reg_model_type
+        }
+        
+        joblib.dump(model_data, filename)
+
+    @classmethod
+    def load_model(cls, filename='rainfall_two_stage_model.joblib'):
+        """
+        加载模型
+        """
+        if not os.path.exists(filename):
+            raise FileNotFoundError(f"模型文件不存在: {filename}")
+        
+        model_data = joblib.load(filename)
+        
+        model = cls(
+            clf_model_type=model_data['clf_model_type'],
+            reg_model_type=model_data['reg_model_type']
+        )
+        
+        model.binary_classifier = model_data['binary_classifier']
+        model.regressor = model_data['regressor']
+        model.threshold = model_data['threshold']
+        
+        return model
--- a/src/utils/feature_engineering.py
+++ b/src/utils/feature_engineering.py
@ -0,0 +1,129 @@
+import numpy as np
+import pandas as pd
+
+class FeatureEngineering:
+    @staticmethod
+    def calculate_time_features(df):
+        """
+        计算时间相关特征
+        """
+        df['hour'] = df['datetime'].dt.hour
+        df['day'] = df['datetime'].dt.day
+        df['month'] = df['datetime'].dt.month
+        df['day_of_week'] = df['datetime'].dt.dayofweek
+        return df
+
+    @staticmethod
+    def calculate_change_features(df, variables):
+        """
+        计算变化率特征
+        """
+        for var in variables:
+            # 1小时变化
+            df[f'{var}_1h_change'] = df[var].diff(1)
+            
+            # 3小时变化
+            df[f'{var}_3h_change'] = df[var].diff(3)
+            
+            # 6小时变化
+            df[f'{var}_6h_change'] = df[var].diff(6)
+            
+            # 计算滚动均值
+            df[f'{var}_3h_mean'] = df[var].rolling(window=3, min_periods=1).mean()
+            df[f'{var}_6h_mean'] = df[var].rolling(window=6, min_periods=1).mean()
+            
+            # 滚动标准差
+            df[f'{var}_6h_std'] = df[var].rolling(window=6, min_periods=1).std().fillna(0)
+            
+            # 异常值
+            var_mean = df[var].mean()
+            var_std = df[var].std()
+            df[f'{var}_anomaly'] = (df[var] - var_mean) / var_std if var_std > 0 else 0
+            
+            # 急剧变化指标
+            var_std = df[f'{var}_1h_change'].std()
+            if var_std > 0:
+                df[f'{var}_rapid_change'] = (np.abs(df[f'{var}_1h_change']) > (2 * var_std)).astype(int)
+            else:
+                df[f'{var}_rapid_change'] = 0
+            
+            # 变化加速度
+            df[f'{var}_change_accel'] = df[f'{var}_1h_change'].diff(1).fillna(0)
+        
+        return df
+
+    @staticmethod
+    def calculate_combined_features(df):
+        """
+        计算组合特征
+        """
+        # PWV和湿度相关特征
+        if all(var in df.columns for var in ['PWV', 'rh']):
+            df['PWV_rh'] = df['PWV'] * df['rh'] / 100
+            df['PWV_cumul_change'] = df['PWV'].diff(1).rolling(window=6).sum().fillna(0)
+            df['rh_cumul_change'] = df['rh'].diff(1).rolling(window=6).sum().fillna(0)
+
+        # PWV和温度相关特征
+        if all(var in df.columns for var in ['PWV', 'Temp']):
+            df['PWV_Temp'] = df['PWV'] * df['Temp']
+            temp_exp = np.exp(17.27 * (df['Temp']-273.15) / ((df['Temp']-273.15) + 237.3))
+            df['saturation_index'] = df['PWV'] / (0.1 * temp_exp)
+
+        # 气压相关特征
+        if all(var in df.columns for var in ['Press', 'Press_1h_change']):
+            df['Press_change_intensity'] = df['Press_1h_change'].abs()
+            df['Press_falling'] = (df['Press_1h_change'] < 0).astype(int)
+            press_quantile = df['Press_1h_change'].abs().quantile(0.75)
+            df['Press_rapid_fall'] = ((df['Press_1h_change'] < 0) & 
+                                    (df['Press_1h_change'].abs() > press_quantile)).astype(int)
+
+        # ZTD和PWV相关特征
+        if all(var in df.columns for var in ['ZTD', 'PWV']):
+            df['ZTD_PWV_ratio'] = df['ZTD'] / df['PWV'].replace(0, np.nan).fillna(df['PWV'].mean())
+            if all(var in df.columns for var in ['ZTD_1h_change', 'PWV_1h_change']):
+                df['ZTD_PWV_change'] = df['ZTD_1h_change'] * df['PWV_1h_change']
+            
+            pwv_mean = df['PWV_6h_mean'].mean()
+            if pwv_mean > 0:
+                df['moisture_instability'] = df['PWV_6h_std'] / df['PWV_6h_mean']
+            else:
+                df['moisture_instability'] = 0
+
+        # 降雨潜力指标
+        if all(var in df.columns for var in ['PWV', 'rh', 'Temp', 'Press_1h_change']):
+            df['rain_potential'] = ((df['PWV'] * df['rh'] / 100) * 
+                                  (0.1 - df['Press_1h_change']) / 
+                                  (df['Temp'] + 10))
+            
+            pwv_mean = df['PWV'].mean() + 0.1
+            df['heavy_rain_potential'] = (df['PWV'] * (df['rh'] / 100) * 
+                                        (df['Press_rapid_fall'] + 1) * 
+                                        (1 + np.abs(df['PWV_1h_change']) / 
+                                         pwv_mean))
+            
+            df['rain_persistence'] = (df['PWV_cumul_change'] * 
+                                    df['rh_cumul_change'] * 
+                                    (1 + df['Press_change_intensity']))
+
+        return df
+
+    @staticmethod
+    def process_features(df):
+        """
+        处理所有特征
+        """
+        # 确保datetime列是datetime类型
+        if 'datetime' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+            df['datetime'] = pd.to_datetime(df['datetime'])
+
+        # 计算时间特征
+        df = FeatureEngineering.calculate_time_features(df)
+
+        # 计算变化率特征
+        variables = ['PWV', 'ZTD', 'Temp', 'Press', 'rh']
+        df = FeatureEngineering.calculate_change_features(df, variables)
+
+        # 计算组合特征
+        df = FeatureEngineering.calculate_combined_features(df)
+
+        return df
--- a/src/utils/visualization.py
+++ b/src/utils/visualization.py
@ -0,0 +1,113 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import numpy as np
+import os
+
+class Visualization:
+    def __init__(self, output_dir='figures'):
+        """
+        初始化可视化工具
+        """
+        self.output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+    def plot_feature_importance(self, feature_importance, title='特征重要性'):
+        """
+        绘制特征重要性图
+        """
+        plt.figure(figsize=(12, 6))
+        feature_importance.sort_values().plot(kind='barh')
+        plt.title(title)
+        plt.xlabel('重要性')
+        plt.tight_layout()
+        plt.savefig(os.path.join(self.output_dir, 'feature_importance.png'))
+        plt.close()
+
+    def plot_rainfall_distribution(self, data, title='降雨量分布'):
+        """
+        绘制降雨量分布图
+        """
+        plt.figure(figsize=(10, 6))
+        sns.histplot(data=data, x='rainfall', bins=50)
+        plt.title(title)
+        plt.xlabel('降雨量 (mm)')
+        plt.ylabel('频次')
+        plt.savefig(os.path.join(self.output_dir, 'rainfall_distribution.png'))
+        plt.close()
+
+    def plot_time_series(self, data, variables, title='时间序列数据'):
+        """
+        绘制时间序列图
+        """
+        plt.figure(figsize=(15, 8))
+        for var in variables:
+            plt.plot(data['datetime'], data[var], label=var)
+        plt.title(title)
+        plt.xlabel('时间')
+        plt.ylabel('值')
+        plt.legend()
+        plt.grid(True)
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+        plt.savefig(os.path.join(self.output_dir, 'time_series.png'))
+        plt.close()
+
+    def plot_correlation_matrix(self, data, title='特征相关性矩阵'):
+        """
+        绘制相关性矩阵热力图
+        """
+        plt.figure(figsize=(12, 10))
+        correlation_matrix = data.corr()
+        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
+        plt.title(title)
+        plt.tight_layout()
+        plt.savefig(os.path.join(self.output_dir, 'correlation_matrix.png'))
+        plt.close()
+
+    def plot_forecast_results(self, forecast_results, title='降雨预报结果'):
+        """
+        绘制预报结果图
+        """
+        time_points = sorted(forecast_results.keys())
+        rain_probs = [results['rain_probability'].mean() for results in forecast_results.values()]
+        rain_amounts = [results['rainfall_amount'].mean() for results in forecast_results.values()]
+
+        # 创建子图
+        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
+
+        # 绘制降雨概率
+        ax1.plot(time_points, rain_probs, 'b-', label='降雨概率')
+        ax1.fill_between(time_points, 0, rain_probs, alpha=0.2)
+        ax1.set_xlabel('预报时间（小时）')
+        ax1.set_ylabel('降雨概率')
+        ax1.set_title('未来降雨概率预报')
+        ax1.grid(True)
+        ax1.legend()
+
+        # 绘制降雨量
+        ax2.plot(time_points, rain_amounts, 'r-', label='预计降雨量')
+        ax2.fill_between(time_points, 0, rain_amounts, alpha=0.2)
+        ax2.set_xlabel('预报时间（小时）')
+        ax2.set_ylabel('降雨量 (mm)')
+        ax2.set_title('未来降雨量预报')
+        ax2.grid(True)
+        ax2.legend()
+
+        plt.tight_layout()
+        plt.savefig(os.path.join(self.output_dir, 'forecast_results.png'))
+        plt.close()
+
+    def plot_model_evaluation(self, y_true, y_pred, title='模型评估'):
+        """
+        绘制模型评估图
+        """
+        plt.figure(figsize=(10, 6))
+        plt.scatter(y_true, y_pred, alpha=0.5)
+        plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--')
+        plt.xlabel('实际值')
+        plt.ylabel('预测值')
+        plt.title(title)
+        plt.grid(True)
+        plt.savefig(os.path.join(self.output_dir, 'model_evaluation.png'))
+        plt.close()