Browse Source

refactor: improve data handling and directory management in emotions crawler

liuyuqi-dellpc 3 days ago
parent
commit
5a8a80aeee
3 changed files with 28 additions and 12 deletions
  1. .env.example (+5 −2)
  2. README.md (+11 −0)
  3. crawl_emotions/emotions.py (+12 −10)

+ 5 - 2
.env.example

@@ -1,2 +1,5 @@
-# 服务器配置
-access_key=xx
+
+page_start=1
+page_end=4328
+
+threads=1

+ 11 - 0
README.md

@@ -2,6 +2,17 @@
 
 表情包爬取工具,速度不能快,否则会触发 Cloudflare 限制。
 
+
+表情来源于: https://fabiaoqing.com/
+
+
+## Develop
+
+先配置 .env,然后运行:
+```
+python main.py
+```
+
 ## License
 
 

+ 12 - 10
crawl_emotions/emotions.py

@@ -18,16 +18,19 @@ class Emotions(object):
     header= {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.142.86 Safari/537.36"
     }
-    def __init__(self):
+    def __init__(self,  params: dict, debug=False):
+        self.params = params
         self.sess = httpx.Client(headers=self.header)
-        self.path = os.path.dirname(os.path.abspath(__file__))
-        if not os.path.exists(self.path+'/data'):
-            os.mkdir(self.path+'/data')
-        self.data_path = self.path+'/data/'
-        self.pool = ThreadPoolExecutor(2)
+        self.app_path=params["app_path"]
+        if not os.path.exists(self.app_path+'/data'):
+            os.mkdir(self.app_path+'/data')
+        self.data_path = self.app_path+'/data/'
+        self.pool = ThreadPoolExecutor(params['threads'])
 
     def run(self):
-        for i in range(1, 4328+1):
+        page_start=self.params["page_start"]
+        page_end=self.params["page_end"]
+        for i in range(page_start, page_end+1):
             url = self._url.format(page=i)
             self.pool.submit(self.get_page, url)
 
@@ -48,6 +51,5 @@ class Emotions(object):
                 break
         print('下载完毕: ', url)
     
-        def __del__(self):
-            self.pool.shutdown(wait=True)
-            
+    def __del__(self):
+        self.pool.shutdown(wait=True)