Expanding into Application Domains: Machine Learning, Databases, and Software Engineering

open-source-cs (video discussing this curriculum). Project repository: https://gitcode.com/gh_mirrors/op/open-source-cs

This article gives an overview of the core content of Stanford's CS229 machine learning course together with a practical guide, including code implementations of key algorithms such as linear regression, logistic regression, and neural networks, as well as the machine learning workflow and best practices. It also covers an in-depth study of database management systems and SQL, including the SQL language family, transaction management, performance-optimization strategies, and trends in modern databases. In addition, it discusses software engineering principles and best practices, including the SOLID principles, the code review process, test-driven development, continuous integration and continuous deployment, architectural design principles, and DevOps culture.

A Hands-On Guide to the Stanford Machine Learning Course

Stanford's CS229 machine learning course, created by AI pioneer Professor Andrew Ng, is one of the most renowned machine learning courses in the world. It provides not only a solid theoretical foundation but, more importantly, teaches students how to apply machine learning algorithms to real-world problems. This guide walks through the core content of CS229 and uses extensive code examples and hands-on projects to help you genuinely master practical machine learning skills.

Core Course Structure and Learning Path

CS229 follows a systematic teaching approach, building a complete learning path that runs from foundational mathematics to advanced algorithm implementation:

(Mermaid diagram of the course learning path omitted.)

Key Algorithm Implementations and Code Examples

1. Linear Regression from Scratch

Linear regression is the foundation of machine learning, and understanding both its mathematical principles and its implementation details is essential.

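As a quick refresher (standard textbook results, not taken verbatim from the course notes), the model minimizes the mean-squared-error cost, and batch gradient descent updates the parameters with its gradients:

J(w, b) = \frac{1}{2m} \sum_{i=1}^{m} \big( w^\top x^{(i)} + b - y^{(i)} \big)^2

\frac{\partial J}{\partial w} = \frac{1}{m} X^\top (\hat{y} - y), \qquad \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} \big( \hat{y}^{(i)} - y^{(i)} \big)

w \leftarrow w - \alpha \frac{\partial J}{\partial w}, \qquad b \leftarrow b - \alpha \frac{\partial J}{\partial b}

The from-scratch implementation below performs exactly these updates:
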
import numpy as np
import matplotlib.pyplot as plt

class LinearRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []
    
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        for _ in range(self.n_iterations):
            y_pred = np.dot(X, self.weights) + self.bias
            
            # Compute the gradients
            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)
            
            # Update the parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            
            # Record the loss
            loss = np.mean((y_pred - y)**2)
            self.loss_history.append(loss)
    
    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

# Usage example
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
y = np.dot(X, np.array([1, 2])) + 3
model = LinearRegression(learning_rate=0.01, n_iterations=1000)
model.fit(X, y)
predictions = model.predict(X)

2. Logistic Regression and Classification

Logistic regression is the core algorithm for binary classification problems. A complete implementation follows.

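For reference (again standard results): with the sigmoid activation and the cross-entropy loss, the gradient takes exactly the same form as in linear regression, which is why the dw and db lines below look identical to the ones above:

\sigma(z) = \frac{1}{1 + e^{-z}}, \qquad \hat{y} = \sigma(Xw + b)

J(w, b) = -\frac{1}{m} \sum_{i=1}^{m} \Big[ y^{(i)} \log \hat{y}^{(i)} + \big(1 - y^{(i)}\big) \log \big(1 - \hat{y}^{(i)}\big) \Big]

\frac{\partial J}{\partial w} = \frac{1}{m} X^\top (\hat{y} - y), \qquad \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} \big( \hat{y}^{(i)} - y^{(i)} \big)

The implementation below mirrors these formulas:
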
import numpy as np

class LogisticRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
    
    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        for _ in range(self.n_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self._sigmoid(linear_model)
            
            # Compute the gradients
            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)
            
            # Update the parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
    
    def predict(self, X, threshold=0.5):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self._sigmoid(linear_model)
        return (y_pred >= threshold).astype(int)

# Breast cancer dataset classification example
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data
data = load_breast_cancer()
X, y = data.data, data.target

# Preprocess the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = LogisticRegression(learning_rate=0.1, n_iterations=3000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
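
A quick sanity check on the held-out set, using the variables defined above:

accuracy = np.mean(predictions == y_test)
print(f"Test accuracy: {accuracy:.3f}")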

Hands-On Project: Handwritten Digit Recognition

Using a neural network to recognize MNIST handwritten digits is a classic project in the CS229 course:

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

class NeuralNetwork:
    def __init__(self, layer_sizes, learning_rate=0.1):
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.parameters = {}
        self.initialize_parameters()
    
    def initialize_parameters(self):
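        # Small random weights and zero biases for every layer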
        for l in range(1, len(self.layer_sizes)):
            self.parameters[f'W{l}'] = np.random.randn(
                self.layer_sizes[l], self.layer_sizes[l-1]) * 0.01
            self.parameters[f'b{l}'] = np.zeros((self.layer_sizes[l], 1))
    
    def relu(self, Z):
        return np.maximum(0, Z)
    
    def softmax(self, Z):
        expZ = np.exp(Z - np.max(Z))
        return expZ / expZ.sum(axis=0, keepdims=True)
    
    def forward_propagation(self, X):
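        # Forward pass: ReLU activations for the hidden layers, softmax at the output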
        cache = {'A0': X}
        L = len(self.parameters) // 2
        
        for l in range(1, L):
            Z = np.dot(self.parameters[f'W{l}'], cache[f'A{l-1}']) + self.parameters[f'b{l}']
            cache[f'Z{l}'] = Z
            cache[f'A{l}'] = self.relu(Z)
        
        ZL = np.dot(self.parameters[f'W{L}'], cache[f'A{L-1}']) + self.parameters[f'b{L}']
        cache[f'Z{L}'] = ZL
        cache[f'A{L}'] = self.softmax(ZL)
        
        return cache
    
    def compute_cost(self, AL, Y):
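        # Cross-entropy loss; the 1e-8 term guards against log(0)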
        m = Y.shape[1]
        cost = -np.sum(Y * np.log(AL + 1e-8)) / m
        return cost
    
    def backward_propagation(self, cache, Y):
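        # Backpropagation for softmax + cross-entropy: dZ_L = A_L - Y, then chain back through the ReLU layers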
        grads = {}
        m = Y.shape[1]
        L = len(self.parameters) // 2
        
        dZL = cache[f'A{L}'] - Y
        grads[f'dW{L}'] = np.dot(dZL, cache[f'A{L-1}'].T) / m
        grads[f'db{L}'] = np.sum(dZL, axis=1, keepdims=True) / m
        
        for l in reversed(range(1, L)):
            dA_prev = np.dot(self.parameters[f'W{l+1}'].T, dZL)
            dZ = dA_prev * (cache[f'Z{l}'] > 0).astype(float)
            grads[f'dW{l}'] = np.dot(dZ, cache[f'A{l-1}'].T) / m
            grads[f'db{l}'] = np.sum(dZ, axis=1, keepdims=True) / m
            dZL = dZ
        
        return grads
    
    def update_parameters(self, grads):
        L = len(self.parameters) // 2
        for l in range(1, L+1):
            self.parameters[f'W{l}'] -= self.learning_rate * grads[f'dW{l}']
            self.parameters[f'b{l}'] -= self.learning_rate * grads[f'db{l}']
    
    def train(self, X, Y, iterations):
        for i in range(iterations):
            cache = self.forward_propagation(X)
            cost = self.compute_cost(cache[f'A{len(self.layer_sizes)-1}'], Y)
            grads = self.backward_propagation(cache, Y)
            self.update_parameters(grads)
            
            if i % 100 == 0:
                print(f"迭代 {i}, 损失: {cost}")

# Load the MNIST dataset
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data / 255.0, mnist.target.astype(int)

# Preprocess: hold out a test set, then one-hot encode the training labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
encoder = OneHotEncoder(sparse_output=False)
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1)).T  # shape (10, n_train)

# Create the network and train it (the implementation expects samples as columns)
nn = NeuralNetwork([784, 128, 64, 10], learning_rate=0.1)
nn.train(X_train.T, y_train_onehot, 1000)
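
Since the class above does not define a predict method, a minimal evaluation sketch built directly on its forward pass might look like this (variable names follow the snippet above):

# Forward pass on the held-out set; predictions are the argmax over the softmax outputs
test_cache = nn.forward_propagation(X_test.T)
test_probs = test_cache[f'A{len(nn.layer_sizes) - 1}']  # shape (10, n_test)
test_predictions = np.argmax(test_probs, axis=0)
print(f"Test accuracy: {np.mean(test_predictions == y_test):.3f}")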

Machine Learning Workflow and Best Practices

The systematic machine learning workflow emphasized in CS229:

(Mermaid diagram of the end-to-end machine learning workflow omitted.)
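
The stages of this workflow (preprocessing, training, evaluation) can be bundled into one reproducible unit with a scikit-learn Pipeline. A minimal sketch follows; note that it imports scikit-learn's LogisticRegression, which would shadow the from-scratch class above if run in the same session:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load the data and hold out a test set
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing and model chained into a single estimator
workflow = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
])
workflow.fit(X_train, y_train)
print(f"Test accuracy: {workflow.score(X_test, y_test):.3f}")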

Feature Engineering Techniques Compared

| Technique | Method | Typical use case | Pros | Cons |
|---|---|---|---|---|
| Standardization | StandardScaler | Distance-based algorithms | Removes the effect of differing scales | Sensitive to outliers |
| Normalization | MinMaxScaler | Neural networks | Maps data to a common range | Affected by extreme values |
| One-hot encoding | OneHotEncoder | Categorical variables | Avoids implying a numeric ordering | Curse of dimensionality |
| Feature selection | SelectKBest | High-dimensional data | Reduces overfitting | May discard information |
| PCA dimensionality reduction | PCA | Data visualization | Removes correlated features | Lower interpretability |
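
A compact sketch of how these transformers are applied in scikit-learn (the data here is synthetic and only illustrates the API):

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X_num = rng.normal(size=(100, 5))                    # numeric features
X_cat = rng.choice(['a', 'b', 'c'], size=(100, 1))   # one categorical feature
y = rng.integers(0, 2, size=100)                     # binary labels

X_std = StandardScaler().fit_transform(X_num)                        # zero mean, unit variance
X_minmax = MinMaxScaler().fit_transform(X_num)                       # scaled to [0, 1]
X_onehot = OneHotEncoder(sparse_output=False).fit_transform(X_cat)   # 3 indicator columns
X_selected = SelectKBest(f_classif, k=3).fit_transform(X_num, y)     # keep the 3 best features
X_pca = PCA(n_components=2).fit_transform(X_std)                     # project onto 2 components

print(X_std.shape, X_minmax.shape, X_onehot.shape, X_selected.shape, X_pca.shape)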

Model Evaluation and Hyperparameter Optimization

Mastering sound model evaluation methodology is key to machine learning practice:

from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_model(y_true, y_pred, model_name):
    print(f"=== {model_name} 评估结果 ===")
    print(classification_report(y_true, y_pred))
    
    # 绘制混淆矩阵
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - 混淆矩阵')
    plt.ylabel('真实标签')
    plt.xlabel('预测标签')
    plt.show()
    
    return classification_report(y_true, y_pred, output_dict=True)

# Hyperparameter optimization example
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)
print("最佳参数:", grid_search.best_params_)
print("最佳得分:", grid_search.best_score_)

Real-World Case Study: A House Price Prediction System

Building on the CS229 material, we construct a complete machine learning system for house-price prediction:

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import joblib

class HousePricePredictor:
    def __init__(self):
        self.model = GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        )
        self.scaler = StandardScaler()
        self.poly = PolynomialFeatures(degree=2, include_bias=False)
        self.is_trained = False
    
    def load_and_preprocess_data(self):
        """Load and preprocess the California housing dataset."""
        housing = fetch_california_housing()
        X, y = housing.data, housing.target
        
        # Add polynomial features (the fitted transformer is reused at prediction time)
        X_poly = self.poly.fit_transform(X)
        
        # Standardize the features
        X_scaled = self.scaler.fit_transform(X_poly)
        
        return X_scaled, y, housing.feature_names
    
    def train_model(self, X, y):
        """Train the model and report cross-validation performance."""
        # 5-fold cross-validation
        cv_scores = cross_val_score(self.model, X, y, cv=5, scoring='r2')
        print(f"Cross-validation R² score: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")
        
        # Fit the final model on the full training set
        self.model.fit(X, y)
        self.is_trained = True
        
        return self.model
    
    def evaluate_model(self, X_test, y_test):
        """Evaluate model performance on a held-out test set."""
        if not self.is_trained:
            raise ValueError("The model has not been trained yet")
        
        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        print(f"Test MSE: {mse:.3f}")
        print(f"Test R²: {r2:.3f}")
        
        return mse, r2
    
    def predict_price(self, features):
        """Predict the price of a single house."""
        if not self.is_trained:
            raise ValueError("The model has not been trained yet")
        
        # Apply the same polynomial expansion and scaling that were fitted during training
        features = np.array(features).reshape(1, -1)
        features_poly = self.poly.transform(features)
        features_scaled = self.scaler.transform(features_poly)
        
        prediction = self.model.predict(features_scaled)
        return prediction[0]
    
    def save_model(self, filepath):
        """Persist the trained model and its preprocessing objects."""
        joblib.dump({
            'model': self.model,
            'scaler': self.scaler,
            'poly': self.poly
        }, filepath)
    
    def load_model(self, filepath):
        """Load a previously trained model."""
        saved_data = joblib.load(filepath)
        self.model = saved_data['model']
        self.scaler = saved_data['scaler']
        self.poly = saved_data['poly']
        self.is_trained = True

# Usage example
predictor = HousePricePredictor()
X, y, feature_names = predictor.load_and_preprocess_data()

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
predictor.train_model(X_train, y_train)

# Evaluate the model
predictor.evaluate_model(X_test, y_test)

# Make a prediction for a sample house
sample_house = [8.3252, 41.0, 6.984127, 1.023810, 322.0, 2.555556, 37.88, -122.23]
predicted_price = predictor.predict_price(sample_house)
print(f"Predicted price: ${predicted_price * 100000:.2f}")

Through this hands-on guide, you not only grasp the theoretical core of Stanford's CS229 course but, more importantly, acquire the complete skill set for applying machine learning algorithms to real problems. From basic linear regression to complex neural networks, and from model training to deployed applications, every step is accompanied by detailed code examples and best-practice advice.

Deep Dive into Database Management Systems and SQL

In today's data-driven world, database management systems (DBMS) and the SQL language form the core technology stack for storing, managing, and analyzing data. As a key part of a computer science education, a deep understanding of database systems goes beyond basic CRUD operations: it also requires mastering advanced SQL features, transaction management, performance optimization, and system architecture design.
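
Before diving into the architecture, here is a minimal, self-contained illustration of CRUD operations and transactional behavior, using Python's built-in sqlite3 module (the table and values are invented for the example):

import sqlite3

# Create an in-memory database with a sample table
conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE accounts (id INTEGER PRIMARY KEY, name TEXT, balance REAL)")

# CRUD: create, read, update, delete
conn.execute("INSERT INTO accounts (name, balance) VALUES (?, ?)", ("alice", 100.0))
conn.execute("INSERT INTO accounts (name, balance) VALUES (?, ?)", ("bob", 50.0))
print(conn.execute("SELECT name, balance FROM accounts").fetchall())
conn.execute("UPDATE accounts SET balance = balance + 10 WHERE name = ?", ("bob",))
conn.execute("DELETE FROM accounts WHERE balance < 20")
conn.commit()

# Transactions: the transfer below either fully succeeds or is rolled back
try:
    with conn:  # commits on success, rolls back if an exception is raised
        conn.execute("UPDATE accounts SET balance = balance - 30 WHERE name = 'alice'")
        conn.execute("UPDATE accounts SET balance = balance + 30 WHERE name = 'bob'")
except sqlite3.Error:
    print("Transfer failed; no partial update was applied")

print(conn.execute("SELECT name, balance FROM accounts").fetchall())
conn.close()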

Database System Architecture and Core Components

A database management system is a complex piece of software built from multiple cooperating components, each with its own responsibilities:

(Mermaid diagram of the DBMS component architecture omitted.)

The core components of a database system include:

  • Hardware layer: physical storage devices, servers, and network infrastructure
  • Software layer: the database engine, operating-system interfaces, and network communication modules
  • Data layer: structured data storage and metadata management
  • Procedure layer: operational procedures, backup and recovery, and security policies
