Mario

解决 RSSHub 订阅微博无法显示图片备忘

最近使用 RSSHub 订阅一些微博博主,每天早上发送一封邮件到邮箱里,结果发现一个问题:如果微博内容里包含图片,图片无法被正确展示出来,显示为裂图,具体错误是 403 Forbidden。

研究了一番后,发现是因为空 Referer 会被微博拦截,最终解决方案记录如下:

  1. 部署一个 python 服务,用来作为 proxy 发起图片 url 请求
  2. 打开 RSSHub 的 HOTLINK 功能:设置 HOTLINK_INCLUDE_PATHS=/weibo ,并将第 1 步部署的服务地址填入模板,即 HOTLINK_TEMPLATE=https://image-proxy.xxx.com/image?url=${href_ue}

具体代码如下:

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
import requests
import os
import urllib.parse
from cachetools import LRUCache, TTLCache
from typing import Any
import httpx


# Shared in-memory response cache (a TTLCache, not a plain LRU cache): it holds
# at most 100 entries and each entry is valid for 5 minutes.
cache = TTLCache(maxsize=100, ttl=5 * 60)
# FastAPI application object on which the route handlers are registered.
app = FastAPI()

# Upstream response headers that must not be replayed to the client.
# Hop-by-hop headers describe the upstream connection only, and httpx hands us
# an already-decoded body, so the upstream Content-Encoding / Content-Length
# no longer describe the bytes we send.
_UNFORWARDED_HEADERS = frozenset({
    'connection',
    'keep-alive',
    'proxy-authenticate',
    'proxy-authorization',
    'te',
    'trailer',
    'transfer-encoding',
    'upgrade',
    'content-encoding',
    'content-length',
})


def _forwardable_headers(upstream_headers, body_length):
    """Return a plain dict of upstream headers that are safe to send on.

    Headers listed in ``_UNFORWARDED_HEADERS`` are dropped and
    ``content-length`` is set to ``body_length``, the size of the body that
    will actually be sent.
    """
    headers = {
        name: value
        for name, value in upstream_headers.items()
        if name.lower() not in _UNFORWARDED_HEADERS
    }
    headers['content-length'] = str(body_length)
    return headers


@app.get('/image')
async def proxy_request(request: Request):
    """Fetch the image named by the ``url`` query parameter and relay it.

    Query parameters:
        url: absolute http(s) URL of the resource to fetch (required).
        referer: optional Referer header to send upstream; defaults to the
            ``DEFAULT_REFERER`` environment variable, or
            ``https://weibo.com`` when that is unset.

    Environment variables:
        DEFAULT_REFERER: default Referer header value.
        USER_AGENT_HEADER: User-Agent to send upstream; when unset, the
            caller's own User-Agent is forwarded.
        PROXY_URI: optional outbound proxy passed to httpx.

    Returns:
        A StreamingResponse carrying the upstream status code, the filtered
        upstream headers and the full body. Responses with status 200 are
        stored in the module-level ``cache`` keyed by ``url`` and served from
        there on later requests.

    Raises:
        HTTPException: 400 when ``url`` is missing or is not an absolute
            http(s) URL; 502 when the upstream request fails.
    """
    # Imported locally because the module header only imports FastAPI and
    # Request from fastapi.
    from fastapi import HTTPException

    # Parse the raw query string ourselves so that escaped '&' characters
    # inside the target URL are handled correctly.
    query_params = urllib.parse.parse_qs(request.url.query)
    url_param = query_params.get('url', [''])[0]

    # Reject unusable targets up front instead of letting httpx fail with an
    # unhandled exception (which would surface as a 500).
    # NOTE(review): any http(s) host is accepted, so this endpoint can be used
    # to reach internal services (SSRF). Consider restricting allowed hosts.
    try:
        target = urllib.parse.urlsplit(url_param)
    except ValueError as exc:
        raise HTTPException(
            status_code=400, detail='Invalid "url" query parameter'
        ) from exc
    if target.scheme not in ('http', 'https') or not target.netloc:
        raise HTTPException(
            status_code=400,
            detail='Query parameter "url" must be an absolute http(s) URL',
        )

    # Serve from the cache when possible.
    cached_response = cache.get(url_param)
    if cached_response is not None:
        headers, content = cached_response
        return StreamingResponse(iter([content]), status_code=200, headers=headers)

    referer_env = os.environ.get('DEFAULT_REFERER', 'https://weibo.com')
    referer_param = query_params.get('referer', [referer_env])[0]

    print(f"Proxying request to URL: {url_param}")
    print(f"Referer: {referer_param}")

    # A configured User-Agent wins; otherwise pass the caller's through.
    user_agent_header = os.environ.get('USER_AGENT_HEADER')
    if user_agent_header is None:
        user_agent_header = request.headers.get('user-agent', '')

    proxy_uri = os.environ.get('PROXY_URI', None)

    try:
        async with httpx.AsyncClient(proxy=proxy_uri) as client:
            response = await client.get(
                url_param,
                headers={'referer': referer_param, 'user-agent': user_agent_header},
            )
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # Report upstream/network failures as a gateway error.
        raise HTTPException(
            status_code=502, detail=f'Upstream request failed: {exc}'
        ) from exc

    # client.get() has already read the whole body; .content is the decoded
    # payload, so Content-Length is recomputed from it.
    response_content = response.content
    headers = _forwardable_headers(response.headers, len(response_content))

    # Only cache successful (HTTP 200) responses.
    if response.status_code == 200:
        cache[url_param] = (headers, response_content)

    return StreamingResponse(
        iter([response_content]),
        status_code=response.status_code,
        headers=headers,
    )

if __name__ == '__main__':
    # Script entry point: serve the app with uvicorn, listening on every
    # network interface.
    import uvicorn

    PORT = 8080
    uvicorn.run(app, port=PORT, host='0.0.0.0')
← back