url.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. @Author: hywell
  5. @Email: hywell.28@gmail.com
  6. @Blog: iassas.com
  7. @Date: 2019/10/16 16:31
  8. """
  9. import re
  10. from lib.core.data import logger
  11. from urllib.parse import urlparse
  12. class URL:
  13. def __init__(self, schema: bytes, host: bytes, port, path: bytes,
  14. query: bytes, fragment: bytes, userinfo: bytes):
  15. self.schema = schema.decode('utf-8')
  16. self.host = host.decode('utf-8')
  17. if port and port != 0:
  18. self.port = port
  19. else:
  20. if schema == b'https':
  21. self.port = 443
  22. else:
  23. self.port = 80
  24. self.path = path.decode('utf-8') if path else ''
  25. self.query = query.decode('utf-8') if query else None
  26. self.fragment = fragment.decode('utf-8') if fragment else None
  27. self.userinfo = userinfo.decode('utf-8') if userinfo else None
  28. self.netloc = self.schema + '://' + self.host + ':' + str(self.port)
  29. @property
  30. def raw(self):
  31. return self.netloc + (self.path or '') + (self.query or '') + (self.fragment or '')
  32. def __repr__(self):
  33. return ('<URL schema: {!r}, host: {!r}, port: {!r}, path: {!r}, '
  34. 'query: {!r}, fragment: {!r}, userinfo: {!r}>'
  35. .format(self.schema, self.host, self.port, self.path, self.query, self.fragment, self.userinfo))
  36. def parse_url(url):
  37. try:
  38. parsed = urlparse(url)
  39. userinfo = b'{parsed.username}:{parsed.password}'
  40. return URL(parsed.scheme, parsed.hostname, parsed.port, parsed.path, parsed.query, parsed.fragment, userinfo)
  41. except Exception:
  42. raise ("invalid url {!r}".format(url))
  43. def url_regex(raw):
  44. """""
  45. Collect url
  46. """
  47. urls = []
  48. try:
  49. urls_regex = re.findall(r"((?:https?|ftp|file):\/\/[\-A-Za-z0-9+&@#/%?=~_|!:,.;\*]+[\-A-Za-z0-9+&@#/%=~_|])",
  50. str(raw))
  51. for url in urls_regex:
  52. url_flag = '<a href="' + url + '" target=_blank />' + url + '</a>'
  53. urls.append(url_flag)
  54. except Exception as e:
  55. logger.error(e)
  56. pass
  57. return urls