Python3 urllib 使用指南及注意事项

1. urllib 模块概述

Python 3 中的 urllib 是标准库中用于处理 URL 的包（package），主要包含以下子模块：

urllib.request - 打开和读取 URL
urllib.error - 包含 urllib.request 抛出的异常
urllib.parse - 解析 URL
urllib.robotparser - 解析 robots.txt 文件

2. 基本使用方法

2.1 发送简单的 GET 请求

import urllib.request
import urllib.parse

# Simplest possible GET request. The response is used as a context manager
# so the underlying connection is closed even if read() raises.
with urllib.request.urlopen('https://httpbin.org/get') as response:
    html = response.read()
print(html.decode('utf-8'))

2.2 发送带参数的 GET 请求

import urllib.request
import urllib.parse

# Build the query string with urlencode so keys and values are escaped.
params = {'name': 'John', 'age': 30}
url = 'https://httpbin.org/get?' + urllib.parse.urlencode(params)

# Close the response deterministically instead of leaving it to the GC.
with urllib.request.urlopen(url) as response:
    data = response.read()
print(data.decode('utf-8'))

2.3 发送 POST 请求

import urllib.request
import urllib.parse

# Form fields must be URL-encoded and then converted to bytes for the body.
post_data = urllib.parse.urlencode({
    'username': 'admin',
    'password': 'secret'
}).encode('utf-8')

# Passing data= already implies POST; method= makes the intent explicit.
req = urllib.request.Request(
    'https://httpbin.org/post',
    data=post_data,
    method='POST'
)

# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

2.4 设置请求头

import urllib.request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept': 'application/json',
    'Authorization': 'Bearer your_token_here'
}

# Request accepts the whole mapping at construction time; this is equivalent
# to calling add_header() once per entry.
req = urllib.request.Request('https://httpbin.org/headers', headers=headers)

# Close the response as soon as the body has been consumed.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

3. 高级功能

3.1 使用代理

import urllib.request

# Map each URL scheme to the proxy that should carry it.
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
})

# install_opener() replaces the process-wide default opener, so every later
# urllib.request.urlopen() call goes through this proxy configuration.
# Use opener.open(...) directly when the proxy should apply to one call only.
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)

# The response is closed automatically when the with-block exits.
with urllib.request.urlopen('https://httpbin.org/ip') as response:
    print(response.read().decode('utf-8'))

3.2 处理 Cookies

import urllib.request
import http.cookiejar

# In-memory cookie store shared by every request made through the opener.
cookie_jar = http.cookiejar.CookieJar()

# The processor saves Set-Cookie headers into the jar and sends matching
# cookies back on later requests.
cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)

# Build an opener with cookie support and make it the process-wide default.
opener = urllib.request.build_opener(cookie_handler)
urllib.request.install_opener(opener)

# Send the request (cookies are handled automatically); the body is drained
# and the connection closed when the with-block exits.
with urllib.request.urlopen('https://httpbin.org/cookies/set?name=value') as response:
    response.read()

# Show what the jar captured so the effect of the handler is visible.
for cookie in cookie_jar:
    print(f"{cookie.name}={cookie.value}")

3.3 处理重定向

import urllib.request

# Redirects are followed automatically. HTTPRedirectHandler stops after
# max_redirections hops (10 in CPython, not 30) and raises HTTPError.
with urllib.request.urlopen('http://httpbin.org/redirect/2') as response:
    print(f"最终 URL: {response.url}")
    print(f"状态码: {response.status}")

# 自定义重定向处理器
class NoRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that hands back the 3xx response instead of following it.

    The opener returns whatever an ``http_error_<code>`` method returns, so
    returning ``fp`` (the original response object) lets the caller inspect
    the redirect's status and ``Location`` header directly.
    """

    def http_error_302(self, req, fp, code, msg, headers):
        """Return the redirect response ``fp`` untouched."""
        return fp

    # Cover every redirect status. 308 must be listed explicitly: Python
    # versions whose base class follows 308 would otherwise still redirect.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = http_error_302

# Use a dedicated opener (not installed globally) so only this call skips
# redirect following; the response is closed when the with-block exits.
opener = urllib.request.build_opener(NoRedirectHandler())
with opener.open('http://httpbin.org/redirect/2') as response:
    print(f"原始响应状态码: {response.status}")

4. 错误处理

4.1 基本错误处理

import urllib.request
import urllib.error

try:
    # The with-block guarantees the connection is closed on success.
    with urllib.request.urlopen('https://httpbin.org/status/404') as response:
        print(response.read().decode('utf-8'))
except urllib.error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first.
    print(f'HTTP 错误: {e.code} - {e.reason}')
except urllib.error.URLError as e:
    # Network-level failures: DNS errors, refused connections, and so on.
    print(f'URL 错误: {e.reason}')
except Exception as e:
    # Last-resort handler for this top-level script.
    print(f'其他错误: {e}')

4.2 设置超时

import urllib.request
import urllib.error
import socket

try:
    # Give up after 5 seconds; the endpoint deliberately waits 10.
    with urllib.request.urlopen(
        'https://httpbin.org/delay/10',
        timeout=5
    ) as response:
        print(response.read().decode('utf-8'))
except socket.timeout:
    # Raised directly when the timeout expires while reading the response.
    print('请求超时')
except urllib.error.URLError as e:
    # A timeout while connecting or sending is wrapped in URLError, with the
    # original socket.timeout available as e.reason.
    if isinstance(e.reason, socket.timeout):
        print('请求超时')
    else:
        print(f'URL 错误: {e.reason}')

5. URL 解析和处理

5.1 解析 URL

from urllib.parse import urlparse, urlunparse, parse_qs, parse_qsl

# Split the URL into its six components; the result unpacks like a tuple.
parsed = urlparse('https://www.example.com/path;params?query=value#fragment')
scheme, netloc, path, params, query, fragment = parsed
print(f"协议: {scheme}")
print(f"域名: {netloc}")
print(f"路径: {path}")
print(f"参数: {params}")
print(f"查询字符串: {query}")
print(f"片段: {fragment}")

# Turn the raw query string into a dict mapping each key to a list of values.
query_params = parse_qs(query)
print(f"查询参数: {query_params}")

# Reassemble the URL from the same six components.
new_url = urlunparse((scheme, netloc, path, params, query, fragment))
print(f"重建的 URL: {new_url}")

5.2 URL 编码和解码

from urllib.parse import quote, unquote, quote_plus, unquote_plus

# Percent-encode: everything except unreserved characters and '/' is escaped,
# and non-ASCII text is encoded as UTF-8 first.
original = "Hello World! 你好"
encoded = quote(original, safe='/', encoding='utf-8')
print(f"编码后: {encoded}")

# Decode back to the original text.
decoded = unquote(encoded, encoding='utf-8')
print(f"解码后: {decoded}")

# quote_plus turns spaces into '+', but it also escapes '=' and '&', so it is
# meant for a single key or value rather than a whole query string.
raw_query = "name=John Doe&city=New York"
query_encoded = quote_plus(raw_query)
print(f"查询编码: {query_encoded}")

6. 重要注意事项

6.1 安全性考虑

import urllib.request
from urllib.parse import urljoin

# 1. 验证 URL（避免 SSRF 攻击）
def is_safe_url(url, allowed_domains):
    """Return True if *url* is an http(s) URL whose host is allow-listed.

    Args:
        url: The URL to validate before fetching it.
        allowed_domains: Iterable of permitted hosts, either bare host names
            (``example.com``) or ``host:port`` pairs. Matching ignores case.

    Returns:
        True only when the scheme is ``http`` or ``https`` and the host
        (optionally with its port) appears in *allowed_domains*. Malformed
        URLs return False.
    """
    # Imported locally so the helper does not rely on a module-level import.
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
        # hostname strips userinfo and port and is lower-cased, so
        # 'https://example.com@evil.com/' resolves to 'evil.com'.
        host, port = parsed.hostname, parsed.port
    except ValueError:
        # Raised for malformed IPv6 literals or non-numeric/out-of-range ports.
        return False

    # urllib can also open file: and ftp: URLs; never let those through.
    if parsed.scheme not in ('http', 'https') or not host:
        return False

    allowed = {domain.lower() for domain in allowed_domains}
    return host in allowed or (port is not None and f'{host}:{port}' in allowed)

# 2. 使用 HTTPS
# 尽量使用 HTTPS 而不是 HTTP

# 3. Resolve relative references with urljoin, then verify the host.
# urljoin normalises '..' segments, but it is not a safety check on its own:
# a reference such as '//evil.com/x' or a full absolute URL replaces the host.
from urllib.parse import urlsplit

base = 'https://example.com/'
relative = '../../evil.com'
safe_url = urljoin(base, relative)
if urlsplit(safe_url).netloc != urlsplit(base).netloc:
    raise ValueError(f"URL escapes the base host: {safe_url}")
print(f"安全拼接的 URL: {safe_url}")

6.2 性能优化

import urllib.request
import gzip
import io

# 1. Connection reuse
# urllib.request does NOT pool or reuse connections: every urlopen() call
# opens a new connection and sends "Connection: close". Use http.client
# directly (or a third-party client) when keep-alive matters.

# 2. Handling compressed content
# urllib does not decompress automatically, so ask for gzip explicitly and
# undo it based on the response's Content-Encoding header.
req = urllib.request.Request('https://httpbin.org/gzip')
req.add_header('Accept-Encoding', 'gzip')

with urllib.request.urlopen(req) as response:
    content_encoding = response.headers.get('Content-Encoding')
    raw_body = response.read()

# Header values are case-insensitive; a missing header means no compression.
if (content_encoding or '').lower() == 'gzip':
    with gzip.GzipFile(fileobj=io.BytesIO(raw_body)) as gz:
        raw_body = gz.read()

content = raw_body.decode('utf-8')
print(content[:500])

6.3 最佳实践

import urllib.request
import json
from typing import Optional, Dict, Any

class SafeHTTPClient:
    """Small urllib-based HTTP client for fetching JSON documents.

    Every request carries a User-Agent header and a timeout. Failures are
    reported on stdout and signalled by a ``None`` return value instead of an
    exception.
    """

    def __init__(self, timeout: int = 10, user_agent: Optional[str] = None):
        """Store the timeout (seconds) and the User-Agent to send.

        Args:
            timeout: Timeout in seconds passed to ``urlopen``.
            user_agent: User-Agent header value; ``'MyApp/1.0'`` when omitted
                or empty.
        """
        self.timeout = timeout
        self.headers = {'User-Agent': user_agent or 'MyApp/1.0'}

    @staticmethod
    def _build_url(url: str, params: Optional[Dict] = None) -> str:
        """Return *url* with *params* appended to its query string.

        Any query string already present in *url* is kept; the new
        parameters are added after it.
        """
        from urllib.parse import urlencode, urlsplit, urlunsplit

        if not params:
            return url
        parts = urlsplit(url)
        extra = urlencode(params)
        # Append rather than replace, so parameters already in the URL survive.
        query = f'{parts.query}&{extra}' if parts.query else extra
        return urlunsplit(parts._replace(query=query))

    def get_json(self, url: str, params: Optional[Dict] = None) -> Any:
        """Fetch *url* and return its decoded JSON body.

        Args:
            url: Target URL; may already contain a query string.
            params: Optional query parameters to append to *url*.

        Returns:
            The parsed JSON value, or ``None`` if the request fails, the
            status is not 200, the Content-Type is not JSON, or the body
            cannot be parsed. Because valid JSON may be falsy (``{}``,
            ``[]``, ``0``), callers should test ``is not None``.
        """
        try:
            req = urllib.request.Request(
                self._build_url(url, params), headers=self.headers
            )

            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                if response.status != 200:
                    raise Exception(f"HTTP 错误: {response.status}")

                content_type = response.headers.get('Content-Type', '')
                # Media types are case-insensitive.
                if 'application/json' not in content_type.lower():
                    raise ValueError(f"不支持的 Content-Type: {content_type}")

                return json.loads(response.read().decode('utf-8'))

        except Exception as e:
            # Deliberate catch-all: this method's contract is "return None on
            # any failure" (network, HTTP, decoding, or parsing).
            print(f"请求失败: {e}")
            return None

# Usage example
client = SafeHTTPClient(timeout=5)
result = client.get_json('https://httpbin.org/json')
# Compare against None rather than truthiness: a successful response may be
# a falsy JSON value such as {}, [] or 0.
if result is not None:
    print(result)

7. 常见问题解决方案

7.1 SSL 证书验证问题

import ssl
import urllib.request

# Method 1: disable certificate verification (INSECURE - testing only).
# Built from the public API instead of the private
# ssl._create_unverified_context() helper.
unsafe_context = ssl.create_default_context()
# check_hostname must be turned off before verify_mode can be CERT_NONE.
unsafe_context.check_hostname = False
unsafe_context.verify_mode = ssl.CERT_NONE
with urllib.request.urlopen(
    'https://expired.badssl.com/',
    context=unsafe_context
) as response:
    # Drain the body; the connection is closed when the with-block exits.
    response.read()

# Method 2: verify against an explicit CA bundle (recommended).
import certifi
import ssl

# certifi.where() returns the path of the CA bundle shipped with certifi.
ssl_context = ssl.create_default_context(cafile=certifi.where())
with urllib.request.urlopen(
    'https://httpbin.org/get',
    context=ssl_context
) as response:
    # Drain the body; the connection is closed when the with-block exits.
    response.read()

7.2 处理大文件下载

import urllib.request

def download_large_file(url, save_path, chunk_size=8192, timeout=None):
    """Stream *url* to *save_path* in fixed-size chunks, printing progress.

    Args:
        url: URL to download.
        save_path: Destination file path; it is overwritten if it exists.
        chunk_size: Number of bytes to read per iteration; must be positive.
        timeout: Optional timeout in seconds for blocking network
            operations. When None, urlopen's own default applies.

    Raises:
        ValueError: If *chunk_size* is not positive.
        urllib.error.URLError: If the request fails.
        OSError: If the destination file cannot be written.
    """
    # read(0) returns b'' at once (empty file) and a negative size reads the
    # whole body into memory, so reject both up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Only forward a timeout when one was given, to keep urlopen's default.
    open_kwargs = {} if timeout is None else {'timeout': timeout}
    req = urllib.request.Request(url)

    with urllib.request.urlopen(req, **open_kwargs) as response:
        # Content-Length is optional and not guaranteed to be numeric;
        # without a usable value the progress display is skipped.
        try:
            total_size = int(response.headers.get('Content-Length', 0))
        except (TypeError, ValueError):
            total_size = 0
        downloaded = 0

        with open(save_path, 'wb') as f:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: response.read(chunk_size), b''):
                f.write(chunk)
                downloaded += len(chunk)

                # Show progress on a single, continuously rewritten line.
                if total_size > 0:
                    percent = (downloaded / total_size) * 100
                    print(f"\r下载进度: {percent:.1f}%", end='')

    print(f"\n文件已保存到: {save_path}")

8. 与其他库的对比

urllib vs requests

特性	urllib	requests
易用性	较低	高
功能	基础	丰富
依赖	Python 内置	需要安装
性能	较好	良好
文档	官方文档	详细且友好

何时使用 urllib：

不想安装额外依赖
只需要基本 HTTP 功能
在受限环境中运行
学习 HTTP 协议基础

何时使用 requests：

需要更简洁的 API
需要高级功能（如会话、OAuth 等）
项目允许第三方依赖
开发生产级应用

总结

Python3 的 urllib 是一个功能强大但较低级别的 HTTP 客户端库。虽然它的 API 不如 requests 简洁，但对于理解 HTTP 协议底层工作原理和学习网络编程非常有帮助。在实际项目中，根据具体需求选择合适的工具，urllib 适合简单的、无需额外依赖的场景，而 requests 则更适合复杂的生产环境。