1 year ago · 687318a9c8
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,3 @@
 
				-"token": "xx"
			
 
				-"user_agent": "xx"
			
 
				-"base_url": "https://api.yuque.com/api/v2"
			
 
				-"data_path": "data"
			
 
				-
			
 
				+token=xx
			
 
				+cookie=_yuque_session 
			
 
				 
			
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 
				 *.pyc
			
 
				 build/
			
 
				 dist/
			
 
				+download/
			
--- a/README.md
+++ b/README.md
@@ -9,11 +9,15 @@
 
				 
			
 
				 复制文档url，执行如下命令：
			
 
				 ```
			
 
				-python main.py -url https://www.yuque.com/burpheart/phpaudit
			
 
				+python main.py markdown -url https://www.yuque.com/burpheart/phpaudit
			
 
				+
			
 
				+wget https://fileshare.yoqi.me/d/dl/c/Python/crawl_yuque/crawl_yuque
			
 
				+chmod +x crawl_yuque
			
 
				+./crawl_yuque markdown -url https://www.yuque.com/burpheart/phpaudit
			
 
				 
			
 
				-./crawl_yuque -url https://www.yuque.com/burpheart/phpaudit
			
 
				 ```
			
 
				 
			
 
				+私有文档配置 .env 文件，chrome 获取cookie填入即可，登录状态可以看到的项目都可以获取。
			
 
				 
			
 
				 ## 源码分析
			
 
				 
			
@@ -37,3 +41,5 @@ Licensed under the [Apache 2.0](LICENSE) © [liuyuqi.gov@msn.cn](https://github.
 
				 ## Reference
			
 
				 
			
 
				 目前有一些其他语言，如php,node 实现的采集工具，本项目实现的主要用途针对自己的项目，导出markdown文件，方便多平台同步。
			
 
				+
			
 
				+- [gxr404/yuque-dl](https://github.com/gxr404/yuque-dl)
			
--- a/crawl_yuque/__init__.py
+++ b/crawl_yuque/__init__.py
@@ -8,7 +8,38 @@
 
				 '''
			
 
				 
			
 
				 from crawl_yuque.yuque import YuQue
			
 
				+import sys,re,os
			
 
				+from crawl_yuque.options import parser_args
			
 
				 
			
 
				-
			
 
				-def main():
			
 
				-    YuQue().run()
			
 
				+def main(argv=None):
			
 
				+    """Main entry point of the program"""
			
 
				+    try:
			
 
				+        args = parser_args()
			
 
				+        if args.get('version'):
			
 
				+            print("0.0.1")
			
 
				+            sys.exit(0)
			
 
				+        command = args.get('command','')
			
 
				+        if command == '':
			
 
				+            # logging.error("command is empty")
			
 
				+            # argparser.print_help()
			
 
				+            sys.exit(1)
			
 
				+        if command =="serve" or command =="server":
			
 
				+            # from apps import create_app
			
 
				+            # app = create_app()
			
 
				+            # app.run(host='127.0.0.1', port=5000, debug=True)
			
 
				+            return
			
 
				+        if command == "markdown":
			
 
				+            crawl = YuQue(args)
			
 
				+            if(args["url"] != ''):
			
 
				+                url = args["url"]
			
 
				+                crawl.get_book(url=url)
			
 
				+            else:
			
 
				+                url = input("请输入语雀文档链接：")
			
 
				+                crawl.get_book(url=url)
			
 
				+        if command == "help":
			
 
				+            return
			
 
				+        if command == "pdf":
			
 
				+            crawl = YuQue(args)
			
 
				+            crawl.pdf()
			
 
				+    except KeyboardInterrupt:
			
 
				+        sys.exit('\nERROR: Interrupted by user')
			
--- a/crawl_yuque/options.py
+++ b/crawl_yuque/options.py
@@ -13,30 +13,18 @@ import shlex
 
				 import dotenv

			
 
				 from collections import OrderedDict

			
 
				 from .utils.str_util import preferredencoding

			
 
				-

			
 
				+from crawl_yuque.utils.frozen_dir import get_app_path

			
 
				 

			
 
				 def parser_args(overrideArguments=None):

			
 
				     """解析参数"""

			
 
				 

			
 
				     argparser = argparse.ArgumentParser()

			
 
				-    argparser.add_argument('-c', '--config', help='config file', default='config.ini')

			
 
				     argparser.add_argument(

			
 
				         'command',

			
 
				         help='command: ',

			
 
				-        choices=['create', 'clone', 'push', 'delete', 'pull'],

			
 
				-    )

			
 
				-    argparser.add_argument('-d', '--debug', help='debug mode', action='store_true')

			
 
				-    argparser.add_argument(

			
 
				-        '-p',

			
 
				-        '--platform',

			
 
				-        help='set a platform',

			
 
				-        choices=['github', 'gitee', 'gitlab', 'gogs', 'gitea', 'bitbucket', 'coding'],

			
 
				-        default='github',

			
 
				+        choices=['markdown', 'pdf', 'serve', 'version', 'help'],

			
 
				     )

			
 
				-    argparser.add_argument('-token', '--token', help='set a token')

			
 
				-    argparser.add_argument(

			
 
				-        '-repo_path', '--repo_path', help='set a repo'

			
 
				-    )  # , default=os.getcwd())

			
 
				+    argparser.add_argument('-url', '--url', help='please input a url', type=str)

			
 
				     args = argparser.parse_args()

			
 
				 

			
 
				     # remove None

			
@@ -47,13 +35,18 @@ def parser_args(overrideArguments=None):
 
				     system_conf = user_conf = custom_conf = OrderedDict()

			
 
				     user_conf = _read_user_conf()

			
 
				 

			
 
				-    if args.config:

			
 
				-        custom_conf = _read_custom_conf(args.config)

			
 
				+    try:

			
 
				+        if args.config:

			
 
				+            custom_conf = _read_custom_conf(args.config)

			
 
				+    except Exception as e:

			
 
				+        pass

			
 
				 

			
 
				     system_conf.update(user_conf)

			
 
				     system_conf.update(command_line_conf)

			
 
				-    if args.command == None and args.extractor == None:

			
 
				-        raise 'Error, please input cmd and extractor params11'

			
 
				+    app_path = get_app_path()

			
 
				+    system_conf["app_path"] = app_path

			
 
				+    # if args.command == None and args.extractor == None:

			
 
				+    #     raise 'Error, please input cmd and extractor params11'

			
 
				     return system_conf

			
 
				 

			
 
				 

			
@@ -67,7 +60,7 @@ def _read_custom_conf(config_path: str) -> OrderedDict:
 
				 

			
 
				     try:

			
 
				         with open(config_path, 'r', encoding=preferredencoding()) as f:

			
 
				-            contents = f.read()

			
 
				+            contents: str = f.read()

			
 
				             res = compat_shlex_split(contents, comments=True)

			
 
				     except Exception as e:

			
 
				         return []

			
@@ -77,7 +70,7 @@ def _read_custom_conf(config_path: str) -> OrderedDict:
 
				 def _read_user_conf() -> OrderedDict:

			
 
				     """读取用户配置文件: .env 文件"""

			
 
				     user_conf = OrderedDict()

			
 
				-    dotenv_path = '.env'

			
 
				+    dotenv_path = os.path.join(get_app_path(), '.env')

			
 
				     if os.path.exists(dotenv_path):

			
 
				         user_conf = dotenv.dotenv_values(dotenv_path)

			
 
				     return OrderedDict(user_conf)

			
--- a/crawl_yuque/yuque.py
+++ b/crawl_yuque/yuque.py
@@ -19,7 +19,7 @@ from . import api
 
				 class YuQue(object):
			
 
				     ''' 语雀知识库下载 '''
			
 
				     
			
 
				-    def __init__(self):
			
 
				+    def __init__(self, args):
			
 
				         self.sess=requests.Session()
			
 
				 
			
 
				         self.logger = logging.getLogger(__name__)
			
@@ -30,10 +30,10 @@ class YuQue(object):
 
				         self.ch.setFormatter(self.formatter)
			
 
				         self.logger.addHandler(self.ch)
			
 
				         
			
 
				-        self.args = None
			
 
				-        self.parser = argparse.ArgumentParser(description='yuque download')
			
 
				-        self.parser.add_argument('-url', '--url', help='url', default='')
			
 
				-        self.args = self.parser.parse_args()
			
 
				+        # self.args = None
			
 
				+        # self.parser = argparse.ArgumentParser(description='yuque download')
			
 
				+        # self.parser.add_argument('-url', '--url', help='url', default='')
			
 
				+        self.args = args
			
 
				 
			
 
				     def save_page(self, book_id, sulg, path):
			
 
				         ''' 保存文档 '''
			
@@ -48,6 +48,7 @@ class YuQue(object):
 
				 
			
 
				     def get_book(self, url):
			
 
				         ''' 获取知识库 '''
			
 
				+        print("获取知识库 " + url + " download.........")
			
 
				         try:
			
 
				             docsdata = requests.get(url)
			
 
				             data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
			
@@ -60,8 +61,9 @@ class YuQue(object):
 
				         md = ""
			
 
				         table = str.maketrans('\/:*?"<>|' + "\n\r", "___________")
			
 
				         prename = ""
			
 
				-        if (os.path.exists("download/" + str(docsjson['book']['id'])) == False):
			
 
				-            os.makedirs("download/" + str(docsjson['book']['id']))
			
 
				+        download_dir= os.path.join(self.args["app_path"], "download", str(docsjson['book']['id']))
			
 
				+        if (os.path.exists(download_dir) == False):
			
 
				+            os.makedirs(download_dir)
			
 
				         # 遍历文档
			
 
				         for doc in docsjson['book']['toc']:
			
 
				             # 创建目录
			
@@ -80,8 +82,8 @@ class YuQue(object):
 
				                     else:
			
 
				                         temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
			
 
				                         break
			
 
				-                if ((os.path.exists("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])) == False):
			
 
				-                    os.makedirs("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])
			
 
				+                if ((os.path.exists(f"{download_dir}/" + temp[doc['uuid']])) == False):
			
 
				+                    os.makedirs(f"{download_dir}/" + temp[doc['uuid']])
			
 
				                 if (temp[doc['uuid']].endswith("/")):
			
 
				                     md += "## " + temp[doc['uuid']][:-1] + "\n"
			
 
				                 else:
			
@@ -96,22 +98,18 @@ class YuQue(object):
 
				                         md += "  " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
			
 
				                             temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
			
 
				                     self.save_page(str(docsjson['book']['id']), doc['url'],
			
 
				-                            "download/" + str(docsjson['book']['id']) + '/' + temp[doc['parent_uuid']] + "/" + doc[
			
 
				+                            f"{download_dir}/" + temp[doc['parent_uuid']] + "/" + doc[
			
 
				                                 'title'].translate(table) + '.md')
			
 
				                 else:
			
 
				                     md += " " + "* [" + doc['title'] + "](" + urllib.parse.quote(
			
 
				                         doc['title'].translate(table) + '.md') + ")" + "\n"
			
 
				                     self.save_page(str(docsjson['book']['id']), doc['url'],
			
 
				-                            "download/" + str(docsjson['book']['id']) + "/" + doc[
			
 
				+                            f"{download_dir}/" + doc[
			
 
				                                 'title'].translate(table) + '.md')
			
 
				-        with open("download/" + str(docsjson['book']['id']) + '/' + "/SUMMARY.md", 'w', encoding='utf-8') as f:
			
 
				+        with open(f"{download_dir}" + "/SUMMARY.md", 'w', encoding='utf-8') as f:
			
 
				             f.write(md)
			
 
				+        print("finish.....")
			
 
				 
			
 
				-    def run(self):
			
 
				-        ''' 获取文档 '''
			
 
				-        if(self.args.url != ''):
			
 
				-            url = self.args.url
			
 
				-            self.get_book(url)
			
 
				-        else:
			
 
				-            url = input("请输入语雀文档链接：")
			
 
				-            self.get_book(url=url)
			
 
				+    def pdf(self):
			
 
				+        """ 生成pdf """
			
 
				+        pass
			
--- a/main.py
+++ b/main.py
@@ -6,8 +6,7 @@
 
				 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				 @Desc    :   enter point
			
 
				 '''
			
 
				-from crawl_yuque import YuQue
			
 
				+from crawl_yuque import main
			
 
				 
			
 
				 if __name__=='__main__':
			
 
				-    yuque = YuQue()
			
 
				-    yuque.run()
			
 
				+    main()