The Spring Festival travel rush is here again, so here is an updated version of my scalper-catching script.
I finally managed to get my train tickets (not through Kuxun or a scalper, of course), so I rewrote the scalper-catching script from last year and am offering it to every programmer still waiting to buy a ticket. I say "scalper-catching", but it naturally covers ordinary ticket resellers too. The principle is still polling the listings on the Kuxun site, with a few new features added:
- Replaced SGMLParser with regular expressions from the `re` module, for better performance
- Multiple URLs can now be polled at once; for example, either Ji'an or Jinggangshan works for me, so I walk through both addresses
- The redirect links are now printed straight to the screen
- Package-level Python 3 support is in place, but because of `re` module changes the regular expressions won't run under Python 3, and I haven't had the energy to update them yet (see the sketch after this list for one likely fix)
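If you want to poke at the Python 3 problem yourself: my guess is that the real culprit is less the `re` module than the bytes-vs-str split, since `read()` returns `bytes` under Python 3 and the script's `str` patterns cannot match against them. A minimal sketch of that fix, assuming Kuxun served GBK-encoded pages at the time (the helper name `get_page_text` is mine, not part of the script below):

```python
import re

def get_page_text(opener, request, encoding='gbk'):
    """Fetch a page and always return text, on Python 2 and 3 alike."""
    raw = opener.open(request).read()  # bytes on Python 3, str on Python 2
    if isinstance(raw, bytes):
        # Assumption: Kuxun pages were GBK-encoded; adjust if that's wrong.
        raw = raw.decode(encoding, 'replace')
    return raw

# str patterns then match cleanly on both versions:
re_links = re.compile(r'<a.*?href=.*?</a>', re.I)
# links = re_links.findall(get_page_text(opener, request))
```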
Kuxun has released its own "instant buy" tool, but I still find it unsatisfactory: first, it gives no output whatsoever, so who knows whether it can actually snatch a ticket; second, it isn't cross-platform and can't currently be used on Mac or Linux.
Patches are welcome. :-)
```python
#!/usr/bin/python
# encoding: utf-8
#
# Catch the yellow cattle script
#
# Author: Xuqing Kuang <[email protected]>
# New features:
# * Use regexp instead of SGMLParser for performance
# * Poll multiple URLs at one time.
# * Print out the redirect URL.
# * Basic package compatibility with Python 3
# TODO:
# * Use one regexp to split the href and text of link
# * Update re package usage to be compatible with Python 3

import time
import os
import re

try:
    import urllib2 as urllib
except ImportError:  # Python 3 compatible
    import urllib.request, urllib.error

urls = (
    'http://piao.kuxun.cn/beijing-jinggangshan/',
    'http://piao.kuxun.cn/beijing-jian/',
)
keyword = '3张'
sequence = 60

class TrainTicket(object):
    """
    Catch the yellow cattle
    """
    def __init__(self, urls, keyword, sequence=60):
        self.urls = urls
        self.keyword = keyword
        self.sequence = sequence
        self.cache = []
        self.html = ''
        self.links = []
        if hasattr(urllib, 'build_opener'):
            self.opener = urllib.build_opener()
        else:  # Python 3 compatible
            self.opener = urllib.request.build_opener()
        self.result = []
        self.re_links = re.compile(r'<a.*?href=.*?</a>', re.I)
        # self.re_element = re.compile('', re.I) # Hardcode at following
        self.requests = []
        for url in urls:
            if hasattr(urllib, 'Request'):
                request = urllib.Request(url)
            else:  # Python 3 compatible
                request = urllib.request.Request(url)
            request.add_header('User-Agent', 'Mozilla/5.0')
            self.requests.append(request)

    def get_page(self, request):
        """
        Open the page.
        """
        try:
            self.html = self.opener.open(request).read()
        except urllib.HTTPError:
            return False
        return self.html

    def get_links(self, html=''):
        """
        Process the page, get all of the links
        """
        if not html:
            html = self.html
        self.links = self.re_links.findall(html)
        return self.links

    def get_element(self, link=''):
        """
        Process the link generated by self.get_links().
        Return list of the href and text
        """
        # FIXME: have no idea how to split the href and text with one regex
        # So use two regexes as a temporary solution
        href = re.findall('(?<=href=").*?(?=")', link)  # Get the href attribute
        if not href:  # Handle anchors with no href attr
            href = ['']
        text = re.split('(<.*?>)', link)[2]  # Get the text of link a.
        href.append(text)  # Append to the list.
        return href

    def get_ticket(self, request=None):
        """
        Generate the data structure of tickets for each URL.
        """
        if not request:
            request = self.requests[0]
        self.get_page(request)
        self.get_links()
        i = 0
        while i < len(self.links):
            link = self.get_element(self.links[i])
            if not link:
                i += 1  # Advance before skipping, to avoid an infinite loop
                continue
            url = link[0]
            name = link[1]
            if name and name.find(self.keyword) >= 0 and url not in self.cache:
                self.result.append((
                    i,
                    name,
                    url,
                ))
                self.cache.append(url)
            i += 1
        return self.result

    def print_tickets(self):
        """
        Process all of the URLs and print out the ticket information.
        """
        while 1:
            self.result = []
            try:
                print('Begin retrieve')
                for request in self.requests:
                    print('Begin scan %s' % request.get_full_url())
                    self.get_ticket(request)
                    print('Found %s urls.' % len(self.links))
                for r in self.result:
                    print('Index: %s\nName: %s\nURL: %s\n' % (
                        r[0], r[1], r[2]
                    ))
                print('Scan finished, begin sleep %s seconds' % self.sequence)
                time.sleep(self.sequence)
            except KeyboardInterrupt:
                exit()
            except:
                raise

if __name__ == '__main__':
    tt = TrainTicket(urls, keyword, sequence)
    tt.print_tickets()
```
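As for the FIXME/TODO about splitting the href and the text with one regular expression: a single pattern with two capturing groups should do it. A sketch, not tested against Kuxun's actual markup (`re_link_parts` is my name, not from the script):

```python
import re

# Group 1 captures the href value, group 2 the anchor text.
re_link_parts = re.compile(r'<a[^>]*?href="([^"]*)"[^>]*>(.*?)</a>', re.I | re.S)

def get_element(link):
    m = re_link_parts.search(link)
    if not m:
        # Anchors without an href attribute won't match this pattern;
        # the two-regex version above still handles those.
        return ['', '']
    return [m.group(1), m.group(2)]
```

It returns the same `[href, text]` list shape as the current `get_element()`, so it could drop in as a replacement once the no-href case is decided.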