温州网站制作_企业网站的开发_制作网站公司_长沙网站设计

张家界网站建设

惠州网站制作公司、北京 网站建设、永久使用、不限域名、网站建设建站

网站开发背景?

温州网站制作_企业网站的开发_制作网站公司_长沙网站设计

proxies = []


def write_to_mongo(ips, city):
    """Insert scraped listing rows into the ``<city>_good`` collection of ``fs_db``.

    Args:
        ips: Iterable of 5-item sequences; items 0-4 are stored under the
            keys ``name``, ``price``, ``addresses``, ``areas`` and ``eq``.
        city: Prefix of the target collection name.
    """
    client = Client(host='localhost', port=27017)
    try:
        coll = client['fs_db'][city + '_good']
        for ip in ips:
            coll.insert_one({
                'name': ip[0],
                'price': ip[1],
                'addresses': ip[2],
                'areas': ip[3],
                'eq': ip[4],
            })
    finally:
        # Release the connection even when an insert fails.
        client.close()


def read_from_mongo(city):
    """Return every document of the ``<city>_good`` collection as a list.

    The query result is materialised before the client is closed, because a
    lazy cursor cannot be relied on once its client has been closed.
    """
    client = Client(host='localhost', port=27017)
    try:
        return list(client['fs_db'][city + '_good'].find())
    finally:
        client.close()


class Consumer(threading.Thread):
    """Worker thread that scrapes one listing page and stores its rows.

    Args:
        args: Tuple ``(url_demo, i, city_id, lock)`` where ``url_demo`` is a
            URL template formatted with the page number ``i``, ``city_id``
            indexes the module-level ``city`` / ``e_city`` sequences, and
            ``lock`` serialises the database write.
    """

    def __init__(self, args):
        threading.Thread.__init__(self, args=args)
        # Keep our own reference instead of reading Thread's private _args.
        self._job = args

    def run(self):
        """Fetch the page, extract the listing fields and write them to MongoDB."""
        url_demo, i, city_id, lock = self._job
        print("{}, 第{}页".format(city[city_id], i))
        soup = get_real(url_demo.format(i))

        names = [tag.text.strip() for tag in soup.select('.tit_shop')]
        addresses = [
            (tag.a.text + " " + tag.span.text).replace('\t', '').replace('\n', '')
            for tag in soup.find_all('p', attrs={'class': 'add_shop'})
        ]
        es = [
            tag.text.replace('\t', '').replace('\n', '')
            for tag in soup.find_all('p', attrs={'class': 'tel_shop'})
        ]
        moneys = [
            tag.text.strip()
            for tag in soup.find_all("span", attrs={"class": 'red'})
        ]
        areas = [
            tag.find_all('span')[-1].text
            for tag in soup.find_all('dd', attrs={'class': 'price_right'})
        ]

        houses = []
        for idx, name in enumerate(names):
            # The field lists are scraped independently and may differ in
            # length; a listing with a missing field is reported and skipped.
            try:
                item = [name, moneys[idx], addresses[idx], areas[idx], es[idx]]
            except IndexError as exc:
                print(exc)
                continue
            print(item)
            houses.append(item)

        # The context manager releases the lock even if the write raises.
        with lock:
            write_to_mongo(houses, e_city[city_id])
        print("线程结束{}".format(i))


def dict2proxy(dic):
    """Build a ``requests``-style proxies mapping from a proxy record.

    Args:
        dic: Mapping with the keys ``type``, ``ip`` and ``port``.

    Returns:
        A dict whose ``http`` and ``https`` entries are both
        ``"<type>://<ip>:<port>"``.
    """
    s = dic['type'] + '://' + dic['ip'] + ':' + str(dic['port'])
    return {'http': s, 'https': s}


def _page_title(soup):
    """Return the stripped ``<title>`` text of *soup*, or ``''`` if it has none."""
    title = soup.find('title')
    return title.text.strip() if title is not None else ''


def get_real(url):
    """Fetch *url* and return it parsed as a gb18030 ``BeautifulSoup`` document.

    When the page title is ``'跳转...'`` the target URL is rebuilt from the
    ``t4`` and ``t3`` variables found in the page's scripts and that URL is
    fetched instead. A page titled ``'访问验证-房天下'`` is returned unchanged.
    """
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.content, 'html.parser', from_encoding='gb18030')
    title = _page_title(soup)
    if title == '跳转...':
        t4_pattern = re.compile(r"var t4='(.*?)';")
        script = soup.find("script", text=t4_pattern)
        t4 = t4_pattern.search(str(script)).group(1)

        t3_pattern = re.compile(r"var t3='(.*?)';")
        script = soup.find("script", text=t3_pattern)
        # NOTE(review): the second-to-last match is used, as in the original;
        # the reason is not visible here - confirm against a live page.
        t3 = re.findall(t3_pattern, str(script))[-2]

        resp = requests.get(t4 + '?' + t3, headers=header)
        soup = BeautifulSoup(resp.content, 'html.parser', from_encoding='gb18030')
    elif title == '访问验证-房天下':
        # No special handling: the page is returned as fetched.
        pass
    return soup


def read_proxies():
    """Return every document of ``proxies_db.proxies`` as a list."""
    client = Client(host='localhost', port=27017)
    try:
        # Original note: "check first, then write, to avoid duplicates".
        # NOTE(review): this function only reads; the note may be stale.
        return list(client['proxies_db']['proxies'].find())
    finally:
        client.close()


def craw():
    """Scrape every page of every city in ``e_city``, ten pages at a time.

    The page count is read from the ``page_al`` pager of page 2; when it
    cannot be parsed, pages 1-100 are attempted.
    """
    lock = threading.Lock()
    for idx in trange(len(e_city)):
        url = eshouse[idx]
        soup = get_real(url.format(2))
        try:
            pager = soup.find('div', attrs={'class': 'page_al'})
            page_number = int(pager.find_all('span')[-1].text[1:-1])
            pages = list(range(1, page_number + 1))
        except (AttributeError, IndexError, ValueError):
            # Pager missing or not in the expected form.
            pages = list(range(1, 101))

        while pages:
            batch = []
            for _ in range(10):
                if not pages:
                    break
                worker = Consumer((url, pages.pop(), idx, lock))
                worker.start()
                batch.append(worker)
            # Join every worker of the batch; removing items from the list
            # while iterating over it would skip every other thread.
            for worker in batch:
                worker.join()


if __name__ == '__main__':
    craw()

猜你喜欢