#!/usr/bin/python
# FileName    : fuck12306.py
# Author      : MaoMao Wang <andelf@gmail.com>
# Created     : Mon Mar 16 22:08:41 2015 by ShuYu Wang
# Copyright   : Feather (c) 2015
# Description : fuck fuck 12306
# Time-stamp: <2016-04-10 16:28:41 andelf>
  8. import re
  9. # hack CERTIFICATE_VERIFY_FAILED
  10. # https://github.com/mtschirs/quizduellapi/issues/2
  11. import ssl
  12. import urllib
  13. import requests
  14. from PIL import Image
  15. from PIL import ImageFilter
  16. if hasattr(ssl, '_create_unverified_context'):
  17. ssl._create_default_https_context = ssl._create_unverified_context
  18. UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"
  19. pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.21191171556711197"
  20. def get_img():
  21. resp = urllib.urlopen(pic_url)
  22. raw = resp.read()
  23. with open("./tmp.jpg", 'wb') as fp:
  24. fp.write(raw)
  25. return Image.open("./tmp.jpg")
  26. def get_sub_img(im, x, y):
  27. assert 0 <= x <= 3
  28. assert 0 <= y <= 2
  29. WITH = HEIGHT = 68
  30. left = 5 + (67 + 5) * x
  31. top = 41 + (67 + 5) * y
  32. right = left + 67
  33. bottom = top + 67
  34. return im.crop((left, top, right, bottom))
  35. def baidu_image_upload(im):
  36. url = "http://image.baidu.com/pictureup/uploadshitu?fr=flash&fm=index&pos=upload"
  37. im.save("./query_temp_img.png")
  38. raw = open("./query_temp_img.png", 'rb').read()
  39. files = {
  40. 'fileheight': "0",
  41. 'newfilesize': str(len(raw)),
  42. 'compresstime': "0",
  43. 'Filename': "image.png",
  44. 'filewidth': "0",
  45. 'filesize': str(len(raw)),
  46. 'filetype': 'image/png',
  47. 'Upload': "Submit Query",
  48. 'filedata': ("image.png", raw)
  49. }
  50. resp = requests.post(url, files=files, headers={'User-Agent': UA})
  51. # resp.url
  52. redirect_url = "http://image.baidu.com" + resp.text
  53. return redirect_url
  54. def baidu_stu_lookup(im):
  55. redirect_url = baidu_image_upload(im)
  56. # print redirect_url
  57. resp = requests.get(redirect_url)
  58. html = resp.text
  59. return baidu_stu_html_extract(html)
  60. def baidu_stu_html_extract(html):
  61. pattern = re.compile(r"'multitags':\s*'(.*?)'")
  62. matches = pattern.findall(html)
  63. if not matches:
  64. return '[ERROR?]'
  65. tags_str = matches[0]
  66. result = list(filter(None, tags_str.replace('\t', ' ').split()))
  67. return '|'.join(result) if result else '[UNKOWN]'
  68. def ocr_question_extract(im):
  69. # git@github.com:madmaze/pytesseract.git
  70. global pytesseract
  71. try:
  72. import pytesseract
  73. except:
  74. print
  75. "[ERROR] pytesseract not installed"
  76. return
  77. im = im.crop((127, 3, 260, 22))
  78. im = pre_ocr_processing(im)
  79. # im.show()
  80. return pytesseract.image_to_string(im, lang='chi_sim').strip()
  81. def pre_ocr_processing(im):
  82. im = im.convert("RGB")
  83. width, height = im.size
  84. white = im.filter(ImageFilter.BLUR).filter(ImageFilter.MaxFilter(23))
  85. grey = im.convert('L')
  86. impix = im.load()
  87. whitepix = white.load()
  88. greypix = grey.load()
  89. for y in range(height):
  90. for x in range(width):
  91. greypix[x, y] = min(255, max(255 + impix[x, y][0] - whitepix[x, y][0],
  92. 255 + impix[x, y][1] -
  93. whitepix[x, y][1],
  94. 255 + impix[x, y][2] - whitepix[x, y][2]))
  95. new_im = grey.copy()
  96. binarize(new_im, 150)
  97. return new_im
  98. def binarize(im, thresh=120):
  99. assert 0 < thresh < 255
  100. assert im.mode == 'L'
  101. w, h = im.size
  102. for y in range(0, h):
  103. for x in range(0, w):
  104. if im.getpixel((x, y)) < thresh:
  105. im.putpixel((x, y), 0)
  106. else:
  107. im.putpixel((x, y), 255)
  108. if __name__ == '__main__':
  109. im = get_img()
  110. # im = Image.open("./tmp.jpg")
  111. try:
  112. print
  113. 'OCR Question:', ocr_question_extract(im)
  114. except Exception as e:
  115. print
  116. '<OCR failed>', e
  117. for y in range(2):
  118. for x in range(4):
  119. im2 = get_sub_img(im, x, y)
  120. result = baidu_stu_lookup(im2)
  121. print(y, x), result