Image Host Crawler
Not much to say here. I went back to scraping the 举个栗子 API at t.alcy.cc and wrote a new script:
```python
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class ImageDownloader:
    def __init__(self, max_workers=10):
        self.base_url = "https://t.alcy.cc/img/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36"
        }
        self.max_workers = max_workers
        self.downloaded_count = 0
        self.total_count = 0
        self.lock = threading.Lock()

    def get_categories(self):
        """Fetch all categories from the index page."""
        print(f"Visiting index page: {self.base_url}")
        try:
            response = requests.get(self.base_url, headers=self.headers)
            response.raise_for_status()
        except Exception as e:
            print(f"Failed to load index page: {e}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        categories = []

        # Category links are rendered as "name (count)", so filter on the parentheses.
        for a in soup.find_all('a', href=True):
            if '(' in a.text and ')' in a.text:
                category_name = a.text.split('(')[0].strip()
                category_url = urljoin(self.base_url, a['href'])
                categories.append({'name': category_name, 'url': category_url})

        return categories

    def get_image_urls(self, category_url):
        """Collect every image URL on a category page."""
        try:
            response = requests.get(category_url, headers=self.headers)
            response.raise_for_status()
        except Exception as e:
            print(f"Failed to load category page: {e}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        img_urls = []

        # Prefer data-src (lazy loading) over src, and skip inline data: URIs.
        for img in soup.find_all('img'):
            src = img.get('data-src') or img.get('src')
            if src and not src.startswith('data:'):
                img_urls.append(urljoin(self.base_url, src))

        # Deduplicate before returning.
        return list(set(img_urls))

    def download_single_image(self, img_url, save_path):
        """Download one image; skip it if the file already exists."""
        # Check for an existing file *before* downloading, so reruns are cheap.
        if os.path.exists(save_path):
            with self.lock:
                self.downloaded_count += 1
            return True, "exists"

        try:
            response = requests.get(img_url, headers=self.headers, timeout=15)
            response.raise_for_status()

            with open(save_path, 'wb') as f:
                f.write(response.content)

            with self.lock:
                self.downloaded_count += 1

            return True, "ok"
        except Exception as e:
            return False, str(e)

    def download_category_images(self, category, img_urls):
        """Download all images of one category with a thread pool."""
        category_name = category['name']

        # One directory per category.
        if not os.path.exists(category_name):
            os.makedirs(category_name)
            print(f"\nCreated directory: {category_name}")

        print(f"Category [{category_name}] has {len(img_urls)} images")

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = []

            for i, img_url in enumerate(img_urls):
                # Derive a file name from the URL path; fall back to an index.
                filename = os.path.basename(urlparse(img_url).path)
                if not filename:
                    filename = f"image_{i}.jpg"

                save_path = os.path.join(category_name, filename)
                future = executor.submit(self.download_single_image, img_url, save_path)
                futures.append((future, filename, i + 1))

            # Report progress as results come back.
            completed = 0
            for future, filename, idx in futures:
                try:
                    success, message = future.result(timeout=20)
                    completed += 1

                    if success and message == "ok":
                        status = "✓"
                    elif success and message == "exists":
                        status = "↻"
                    else:
                        status = "✗"

                    progress = f"[{idx}/{len(img_urls)}] {status} {filename}"
                    print(f"\r[{category_name}] {completed}/{len(img_urls)} | {progress[:50]}", end="")
                except Exception:
                    completed += 1
                    print(f"\r[{category_name}] {completed}/{len(img_urls)} | failed: {filename[:30]}...", end="")

        print(f"\nCategory [{category_name}] done")

    def run(self):
        print("=" * 60)
        print("Fetching image categories...")
        print("=" * 60)

        categories = self.get_categories()
        if not categories:
            print("No categories found.")
            return

        print(f"Found {len(categories)} categories: {[c['name'] for c in categories]}")

        # Collect the image URLs of every category first.
        all_img_urls = []
        category_data = []

        for cat in categories:
            print(f"Listing images for category [{cat['name']}]...", end="")
            img_urls = self.get_image_urls(cat['url'])
            all_img_urls.extend(img_urls)
            category_data.append({'category': cat, 'img_urls': img_urls})
            print(f" found {len(img_urls)} images")

        self.total_count = len(all_img_urls)
        print(f"\n{self.total_count} images found in total")
        print("=" * 60)

        # Download category by category and time the whole run.
        start_time = time.time()

        for data in category_data:
            if data['img_urls']:
                self.download_category_images(data['category'], data['img_urls'])

        total_time = time.time() - start_time

        print("\n" + "=" * 60)
        print("All downloads finished")
        print("=" * 60)
        print(f"Total images: {self.total_count}")
        print(f"Downloaded or already present: {self.downloaded_count}")
        print(f"Elapsed time: {total_time:.2f}s")
        if total_time > 0:
            print(f"Average speed: {self.downloaded_count / total_time:.2f} images/s")
        print("=" * 60)


def main():
    # Thread count; tune this to your network and machine.
    max_workers = 20

    downloader = ImageDownloader(max_workers=max_workers)
    downloader.run()


if __name__ == "__main__":
    main()
```
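One thing the script leaves on the table: every call to `requests.get` opens a fresh TCP/TLS connection, which adds up over thousands of images. A common refinement is to give each pool worker its own `requests.Session` via `threading.local()`, so connections stay alive and get reused. The sketch below is not part of the original script; `session_for_thread` is a hypothetical helper and the `User-Agent` shown is a placeholder.

```python
import threading

import requests

_thread_local = threading.local()


def session_for_thread() -> requests.Session:
    """Return a requests.Session private to the calling thread.

    A Session keeps the underlying connection alive, so each thread-pool
    worker reuses one connection instead of reconnecting per image.
    NOTE: hypothetical helper, not part of the script above.
    """
    if not hasattr(_thread_local, "session"):
        session = requests.Session()
        session.headers.update({"User-Agent": "Mozilla/5.0"})  # placeholder UA
        _thread_local.session = session
    return _thread_local.session


# Inside download_single_image you would then swap requests.get for:
#     response = session_for_thread().get(img_url, timeout=15)
```

Since `ThreadPoolExecutor` reuses its worker threads, this creates at most `max_workers` sessions for the whole run.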









