Browse Source

Merge branch 'release/0.3.0'

liuyuqi-dellpc 7 months ago
parent
commit
687318a9c8
7 changed files with 79 additions and 54 deletions
  1. + 2 - 5
      .env.example
  2. + 1 - 0
      .gitignore
  3. + 8 - 2
      README.md
  4. + 34 - 3
      crawl_yuque/__init__.py
  5. + 14 - 21
      crawl_yuque/options.py
  6. + 18 - 20
      crawl_yuque/yuque.py
  7. + 2 - 3
      main.py

+ 2 - 5
.env.example

@@ -1,6 +1,3 @@
-"token": "xx"
-"user_agent": "xx"
-"base_url": "https://api.yuque.com/api/v2"
-"data_path": "data"
-
+token=xx
+cookie=_yuque_session 
 

+ 1 - 0
.gitignore

@@ -1,3 +1,4 @@
 *.pyc
 build/
 dist/
+download/

+ 8 - 2
README.md

@@ -9,11 +9,15 @@
 
 复制文档url,执行如下命令:
 ```
-python main.py -url https://www.yuque.com/burpheart/phpaudit
+python main.py markdown -url https://www.yuque.com/burpheart/phpaudit
+
+wget https://fileshare.yoqi.me/d/dl/c/Python/crawl_yuque/crawl_yuque
+chmod +x crawl_yuque
+./crawl_yuque markdown -url https://www.yuque.com/burpheart/phpaudit
 
-./crawl_yuque -url https://www.yuque.com/burpheart/phpaudit
 ```
 
+私有文档需要配置 .env 文件:从 Chrome 获取 cookie 并填入即可,登录状态下可以看到的项目都可以获取。
 
 ## 源码分析
 
@@ -37,3 +41,5 @@ Licensed under the [Apache 2.0](LICENSE) © [liuyuqi.gov@msn.cn](https://github.
 ## Reference
 
 目前已有一些用其他语言(如 PHP、Node)实现的采集工具;本项目主要针对自己的项目,将文档导出为 Markdown 文件,方便多平台同步。
+
+- [gxr404/yuque-dl](https://github.com/gxr404/yuque-dl)

+ 34 - 3
crawl_yuque/__init__.py

@@ -8,7 +8,38 @@
 '''
 
 from crawl_yuque.yuque import YuQue
+import sys,re,os
+from crawl_yuque.options import parser_args
 
-
-def main():
-    YuQue().run()
+def main(argv=None):
+    """Main entry point of the program"""
+    try:
+        args = parser_args()
+        if args.get('version'):
+            print("0.0.1")
+            sys.exit(0)
+        command = args.get('command','')
+        if command == '':
+            # logging.error("command is empty")
+            # argparser.print_help()
+            sys.exit(1)
+        if command =="serve" or command =="server":
+            # from apps import create_app
+            # app = create_app()
+            # app.run(host='127.0.0.1', port=5000, debug=True)
+            return
+        if command == "markdown":
+            crawl = YuQue(args)
+            if(args["url"] != ''):
+                url = args["url"]
+                crawl.get_book(url=url)
+            else:
+                url = input("请输入语雀文档链接:")
+                crawl.get_book(url=url)
+        if command == "help":
+            return
+        if command == "pdf":
+            crawl = YuQue(args)
+            crawl.pdf()
+    except KeyboardInterrupt:
+        sys.exit('\nERROR: Interrupted by user')

+ 14 - 21
crawl_yuque/options.py

@@ -13,30 +13,18 @@ import shlex
 import dotenv
 from collections import OrderedDict
 from .utils.str_util import preferredencoding
-
+from crawl_yuque.utils.frozen_dir import get_app_path
 
 def parser_args(overrideArguments=None):
     """解析参数"""
 
     argparser = argparse.ArgumentParser()
-    argparser.add_argument('-c', '--config', help='config file', default='config.ini')
     argparser.add_argument(
         'command',
         help='command: ',
-        choices=['create', 'clone', 'push', 'delete', 'pull'],
-    )
-    argparser.add_argument('-d', '--debug', help='debug mode', action='store_true')
-    argparser.add_argument(
-        '-p',
-        '--platform',
-        help='set a platform',
-        choices=['github', 'gitee', 'gitlab', 'gogs', 'gitea', 'bitbucket', 'coding'],
-        default='github',
+        choices=['markdown', 'pdf', 'serve', 'version', 'help'],
     )
-    argparser.add_argument('-token', '--token', help='set a token')
-    argparser.add_argument(
-        '-repo_path', '--repo_path', help='set a repo'
-    )  # , default=os.getcwd())
+    argparser.add_argument('-url', '--url', help='please input a url', type=str)
     args = argparser.parse_args()
 
     # remove None
@@ -47,13 +35,18 @@ def parser_args(overrideArguments=None):
     system_conf = user_conf = custom_conf = OrderedDict()
     user_conf = _read_user_conf()
 
-    if args.config:
-        custom_conf = _read_custom_conf(args.config)
+    try:
+        if args.config:
+            custom_conf = _read_custom_conf(args.config)
+    except Exception as e:
+        pass
 
     system_conf.update(user_conf)
     system_conf.update(command_line_conf)
-    if args.command == None and args.extractor == None:
-        raise 'Error, please input cmd and extractor params11'
+    app_path = get_app_path()
+    system_conf["app_path"] = app_path
+    # if args.command == None and args.extractor == None:
+    #     raise 'Error, please input cmd and extractor params11'
     return system_conf
 
 
@@ -67,7 +60,7 @@ def _read_custom_conf(config_path: str) -> OrderedDict:
 
     try:
         with open(config_path, 'r', encoding=preferredencoding()) as f:
-            contents = f.read()
+            contents: str = f.read()
             res = compat_shlex_split(contents, comments=True)
     except Exception as e:
         return []
@@ -77,7 +70,7 @@ def _read_custom_conf(config_path: str) -> OrderedDict:
 def _read_user_conf() -> OrderedDict:
     """读取用户配置文件: .env 文件"""
     user_conf = OrderedDict()
-    dotenv_path = '.env'
+    dotenv_path = os.path.join(get_app_path(), '.env')
     if os.path.exists(dotenv_path):
         user_conf = dotenv.dotenv_values(dotenv_path)
     return OrderedDict(user_conf)

+ 18 - 20
crawl_yuque/yuque.py

@@ -19,7 +19,7 @@ from . import api
 class YuQue(object):
     ''' 语雀知识库下载 '''
     
-    def __init__(self):
+    def __init__(self, args):
         self.sess=requests.Session()
 
         self.logger = logging.getLogger(__name__)
@@ -30,10 +30,10 @@ class YuQue(object):
         self.ch.setFormatter(self.formatter)
         self.logger.addHandler(self.ch)
         
-        self.args = None
-        self.parser = argparse.ArgumentParser(description='yuque download')
-        self.parser.add_argument('-url', '--url', help='url', default='')
-        self.args = self.parser.parse_args()
+        # self.args = None
+        # self.parser = argparse.ArgumentParser(description='yuque download')
+        # self.parser.add_argument('-url', '--url', help='url', default='')
+        self.args = args
 
     def save_page(self, book_id, sulg, path):
         ''' 保存文档 '''
@@ -48,6 +48,7 @@ class YuQue(object):
 
     def get_book(self, url):
         ''' 获取知识库 '''
+        print("获取知识库 " + url + " download.........")
         try:
             docsdata = requests.get(url)
             data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
@@ -60,8 +61,9 @@ class YuQue(object):
         md = ""
         table = str.maketrans('\/:*?"<>|' + "\n\r", "___________")
         prename = ""
-        if (os.path.exists("download/" + str(docsjson['book']['id'])) == False):
-            os.makedirs("download/" + str(docsjson['book']['id']))
+        download_dir= os.path.join(self.args["app_path"], "download", str(docsjson['book']['id']))
+        if (os.path.exists(download_dir) == False):
+            os.makedirs(download_dir)
         # 遍历文档
         for doc in docsjson['book']['toc']:
             # 创建目录
@@ -80,8 +82,8 @@ class YuQue(object):
                     else:
                         temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
                         break
-                if ((os.path.exists("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])) == False):
-                    os.makedirs("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])
+                if ((os.path.exists(f"{download_dir}/" + temp[doc['uuid']])) == False):
+                    os.makedirs(f"{download_dir}/" + temp[doc['uuid']])
                 if (temp[doc['uuid']].endswith("/")):
                     md += "## " + temp[doc['uuid']][:-1] + "\n"
                 else:
@@ -96,22 +98,18 @@ class YuQue(object):
                         md += "  " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
                             temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
                     self.save_page(str(docsjson['book']['id']), doc['url'],
-                            "download/" + str(docsjson['book']['id']) + '/' + temp[doc['parent_uuid']] + "/" + doc[
+                            f"{download_dir}/" + temp[doc['parent_uuid']] + "/" + doc[
                                 'title'].translate(table) + '.md')
                 else:
                     md += " " + "* [" + doc['title'] + "](" + urllib.parse.quote(
                         doc['title'].translate(table) + '.md') + ")" + "\n"
                     self.save_page(str(docsjson['book']['id']), doc['url'],
-                            "download/" + str(docsjson['book']['id']) + "/" + doc[
+                            f"{download_dir}/" + doc[
                                 'title'].translate(table) + '.md')
-        with open("download/" + str(docsjson['book']['id']) + '/' + "/SUMMARY.md", 'w', encoding='utf-8') as f:
+        with open(f"{download_dir}" + "/SUMMARY.md", 'w', encoding='utf-8') as f:
             f.write(md)
+        print("finish.....")
 
-    def run(self):
-        ''' 获取文档 '''
-        if(self.args.url != ''):
-            url = self.args.url
-            self.get_book(url)
-        else:
-            url = input("请输入语雀文档链接:")
-            self.get_book(url=url)
+    def pdf(self):
+        """ 生成pdf """
+        pass

+ 2 - 3
main.py

@@ -6,8 +6,7 @@
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   enter point
 '''
-from crawl_yuque import YuQue
+from crawl_yuque import main
 
 if __name__=='__main__':
-    yuque = YuQue()
-    yuque.run()
+    main()