import os
import sys
import codecs
import argparse
import socket
import re
from contextlib import closing
from time import sleep

from fake_useragent import UserAgent

# Use urllib2 on Python 2 and urllib.request/urllib.error on Python 3,
# under common aliases so the rest of the code is version-agnostic.
if sys.version_info[0] >= 3:
    import urllib.request as request
    import urllib.error as urer
else:
    import urllib2 as request
    import urllib2 as urer


def download_page(url, referer, mt, timeout, pause):
    """Download a URL, retrying up to mt times; returns (html, http_code)."""
    t = 0
    hp = None
    code = 404
    while t < mt and hp is None:
        try:
            req = request.Request(url)
            req.add_header('Referer', referer)
            # Randomize the User-agent on every request to look less like a bot.
            ua = UserAgent()
            req.add_header('User-agent', ua.random)
            with closing(request.urlopen(req, timeout=timeout)) as f:
                code = f.getcode()
                hp = f.read()
            sleep(pause)
        except (urer.URLError, socket.timeout, socket.error):
            t += 1
    if hp:
        return hp.decode('utf-8'), code
    else:
        return None, code


def main():
    # sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--domain',
                        help='Domain from which to download the reviews. Default: com',
                        required=False, type=str, default='com')
    parser.add_argument('-f', '--force',
                        help='Force download even if already successfully downloaded',
                        required=False, action='store_true')
    parser.add_argument('-r', '--mt',
                        help='Max retries to download a file. Default: 3',
                        required=False, type=int, default=3)
    parser.add_argument('-t', '--timeout',
                        help='Timeout in seconds for http connections. Default: 180',
                        required=False, type=int, default=180)
    parser.add_argument('-p', '--pause',
                        help='Seconds to wait between http requests. Default: 1',
                        required=False, default=1, type=float)
    parser.add_argument('-m', '--maxreviews',
                        help='Maximum number of reviews per item to download. Default: unlimited',
                        required=False, type=int, default=-1)
    parser.add_argument('-o', '--out',
                        help='Output base path. Default: amazonreviews',
                        type=str, default='amazonreviews')
    parser.add_argument('-c', '--captcha',
                        help='Retry on captcha pages until captcha is not asked. Default: skip',
                        required=False, action='store_true')
    parser.add_argument('ids', metavar='ID', nargs='+',
                        help='Product IDs for which to download reviews')
    args = parser.parse_args()

    basepath = args.out + os.sep + args.domain
    # The pagination links at the bottom of each page carry the number of
    # the page they point to; the highest one seen is the last page.
    counterre = re.compile(r'cm_cr_arp_d_paging_btm_([0-9]+)')
    # Captcha pages embed their challenge image from this host path.
    robotre = re.compile(r'images-amazon\.com/captcha/')
    for id_ in args.ids:
        if not os.path.exists(basepath + os.sep + id_):
            os.makedirs(basepath + os.sep + id_)
        # Build the review-page URL from the requested marketplace domain
        # (a previously hardcoded ".com" here ignored the -d flag).
        urlPart1 = "http://www.amazon." + args.domain + "/product-reviews/"
        urlPart2 = "/?ie=UTF8&showViewpoints=0&pageNumber="
        urlPart3 = "&sortBy=bySubmissionDateDescending"
        referer = urlPart1 + str(id_) + urlPart2 + "1" + urlPart3
        page = 1
        lastPage = 1
        while page <= lastPage:
            # Skip pages already on disk unless -f/--force is given
            # (page 1 is always refetched, since it determines lastPage).
            if page != 1 and not args.force and os.path.exists(
                    basepath + os.sep + id_ + os.sep + id_ + '_' + str(page) + '.html'):
                print('Already got page ' + str(page) + ' for product ' + id_)
                page += 1
                continue
            url = urlPart1 + str(id_) + urlPart2 + str(page) + urlPart3
            print(url)
            hp, code = download_page(url, referer, args.mt, args.timeout, args.pause)
            if hp is None or code != 200:
                if code == 503:
                    # Throttled by the server: back off and retry the same page
                    # (the loop-level increment below undoes this decrement).
                    page -= 1
                    args.pause += 2
                    print('(' + str(code) + ') Retrying downloading the URL: ' + url)
                else:
                    print('(' + str(code) + ') Done downloading the URL: ' + url)
                    break
            else:
                print('Got page ' + str(page) + ' out of ' + str(lastPage) +
                      ' for product ' + id_ + ' timeout=' + str(args.pause))
                if robotre.search(hp):
                    # Captcha page: retry it when -c/--captcha is set (and always
                    # for page 1, which is needed to learn lastPage); otherwise
                    # just slow down and skip it.
                    print('timeout=' + str(args.pause))
                    if args.captcha or page == 1:
                        args.pause *= 2
                        continue
                    else:
                        args.pause += 2
                # Track the highest page number seen in the pagination links.
                for m in counterre.findall(hp):
                    try:
                        vl = int(m)
                        if vl > lastPage:
                            lastPage = vl
                    except ValueError:
                        pass
                with codecs.open(basepath + os.sep + id_ + os.sep + id_ + '_' + str(page) + '.html',
                                 mode='w', encoding='utf8') as out_file:
                    out_file.write(hp)
                # Successful download: gently relax the pause between requests.
                if args.pause >= 2:
                    args.pause -= 1
                referer = urlPart1 + str(id_) + urlPart2 + str(page) + urlPart3
                # Each review page holds 10 reviews; stop once -m/--maxreviews is reached.
                if args.maxreviews > 0 and page * 10 >= args.maxreviews:
                    break
            page += 1


if __name__ == '__main__':
    main()
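
# Example invocation (a minimal sketch: the script filename and the product ID
# below are assumptions, and the fake_useragent package must be installed,
# e.g. via "pip install fake-useragent"):
#
#   python amazon_reviews_downloader.py -d com -p 2 -m 100 B00EXAMPLE
#
# This would fetch at most 100 reviews (ten per page) for product B00EXAMPLE
# from amazon.com, pausing two seconds between requests, and save each page as
# amazonreviews/com/B00EXAMPLE/B00EXAMPLE_<page>.html.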