# -*- coding:utf-8 -*-
"""Look up companies on qichacha.com with Selenium and export their details.

The script reads its settings from ``config.cfg`` (section ``[config]``),
loads a list of search terms from ``companypath``, searches each one on
qichacha.com with a Chrome WebDriver and, when ``export`` is 0, appends one
row per company to the CSV file at ``csvpath``.
"""
import os, pymysql, csv, configparser, pickle
import time

from selenium import webdriver
from user_agent import generate_user_agent


# Module-level state shared by the functions below.
browser_loaded = 0  # 1 while a Chrome WebDriver instance is held in DRIVER
csvinited = 0       # 1 once the CSV header row has been written

# Read the configuration file.
config = configparser.RawConfigParser()
config.read('config.cfg')
debugmode = config.getint("config", 'debugmode')
cookiedumped = config.getint("config", 'cookiedumped')
csvpath = config.get("config", 'csvpath')
export = config.getint("config", 'export')
companypath = config.get("config", 'companypath')
chromedriver = config.get("config", "chromedriver")

# XPaths of the fields read from the company page header, in CSV column
# order: company name, phone, website, address.
_HEADER_XPATHS = (
    '//*[@id="company-top"]/div/div[2]/div[1]/h1',
    '//*[@id="company-top"]/div[1]/div[2]/div[2]/span[1]/span[2]/span',
    '//*[@id="company-top"]/div[1]/div[2]/div[2]/span[3]',
    '//*[@id="company-top"]/div[1]/div[2]/div[3]/span[3]/a[1]',
)


def _report_elapsed(op, clock):
    """Print the time elapsed since ``clock[0]`` for *op*, then reset it.

    Nothing is printed when *op* is None; the clock is reset either way.
    """
    if op is not None:
        duration = time.time() - clock[0]
        print('%s finished. Duration %.6f seconds.' % (op, duration))
    clock[0] = time.time()


def dur(op=None, clock=[time.time()]):
    """Per-item stopwatch.

    Call with no arguments to restart the timer; call with a label to print
    the time since the previous call and restart it.

    The mutable default ``clock`` is deliberate: it is created once at
    definition time and keeps the last timestamp between calls.
    """
    _report_elapsed(op, clock)


def durt(op=None, clock=[time.time()]):
    """Whole-run stopwatch; same contract as ``dur`` with its own clock."""
    _report_elapsed(op, clock)


def init_db():
    """Open the MySQL connection and store it in the global ``CONNECTION``.

    The connection parameters are placeholders that must be replaced with
    real values. Keyword arguments are used because PyMySQL 1.0+ no longer
    accepts positional connection parameters.
    """
    global CONNECTION
    CONNECTION = pymysql.connect(
        host="地址",
        user="用户名",
        password="密码",
        database="数据库",
        use_unicode=True,
        charset="utf8",
    )


def close_db():
    """Close the connection opened by ``init_db``."""
    CONNECTION.close()


def init_web_driver(opt1=0):
    """Start Chrome and store the driver in the global ``DRIVER``.

    Args:
        opt1: 0 starts Chrome headless with image loading disabled; any
            other value leaves those two options off.
    """
    global DRIVER, browser_loaded
    user_agent = generate_user_agent()
    co = webdriver.ChromeOptions()
    # Chrome driver default setting under Windows OS
    co.add_argument('--disable-gpu')

    if opt1 == 0:
        # Set the Chrome in headless mode
        co.add_argument('--headless')
        # Disable images loading
        co.add_argument('blink-settings=imagesEnabled=false')

    # Add User-Agent Profile
    co.add_argument('--user-agent={}'.format(user_agent))

    # Initialize Chrome
    DRIVER = webdriver.Chrome(
        chrome_options=co,
        executable_path=chromedriver,
        service_log_path=os.path.devnull
    )
    browser_loaded = 1
    print('Chrome process loaed.')


def close_web_driver():
    """Quit the browser held in ``DRIVER`` and mark it as not loaded."""
    global browser_loaded
    DRIVER.quit()
    browser_loaded = 0


def write_csv(inputstr, filename='result.csv', opt='a+'):
    """Write one row to a CSV file.

    Args:
        inputstr: Sequence of cell values forming the row.
        filename: Target file; a blank name falls back to ``result.csv``.
        opt: File mode passed to ``open`` (``'a+'`` appends, ``'w+'``
            truncates).
    """
    if filename.strip() == '':
        filename = 'result.csv'
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, opt, newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(inputstr)
    print('CSV writed.')


def init_csv():
    """Create (or truncate) the output CSV and write its header row."""
    global csvinited
    headline = ['搜索项', '企业名称', '电话', '官网', '地址', '注册资本', '实缴资本',
                '经营状态', '成立日期', '统一社会信用代码', '纳税人识别号',
                '注册号', '组织机构代码', '公司类型', '所属行业', '核准日期',
                '登记机关', '所属地区', '英文名', '曾用名', '经营方式', '人员规模',
                '营业期限', '企业地址', '经营范围']
    write_csv(headline, csvpath, 'w+')
    csvinited = 1
    print('Output CSV ready.')


# TODO: write_sql() (MySQL export) is not implemented yet.


def get_companylist(filename='company.csv'):
    """Return the search terms stored one per line in *filename*.

    Trailing newlines are removed and blank lines are skipped so that an
    empty line does not trigger an empty search.
    """
    with open(filename, 'r') as f:
        names = (line.replace('\n', '') for line in f)
        company_list = [name for name in names if name.strip()]
    print('Company list loaded.')
    return company_list


def table_reduction(searchitem, table, opt=1):
    """Collect one company's details and export them.

    The row is built in the column order written by ``init_csv``: the search
    term, four fields from the page header, then the cells of the
    registration table.

    Args:
        searchitem: The term that was searched for; stored in column one.
        table: WebElement of the registration information table.
        opt: Unused; kept for backward compatibility.
    """
    table_rows = table.find_elements_by_tag_name('tr')

    query_result = [searchitem]
    # Company name, phone, website, address.
    for xpath in _HEADER_XPATHS:
        query_result.append(DRIVER.find_element_by_xpath(xpath).text)

    # Rows 0-8 each carry two values (cells 1 and 3): registered capital /
    # paid-in capital, status / founding date, credit code / taxpayer id,
    # registration no. / organisation code, company type / industry,
    # approval date / registration authority, region / English name,
    # former name / business mode, staff size / business term.
    # Rows 9 and 10 carry one value each (cell 1): company address and
    # business scope.
    for row_index in range(11):
        cells = table_rows[row_index].find_elements_by_tag_name('td')
        query_result.append(cells[1].text)
        if row_index <= 8:
            query_result.append(cells[3].text)

    # TODO: export == 1 is reserved for writing to MySQL (not implemented).
    if export == 0:  # Write in local csv
        write_csv(query_result, csvpath)


# Obtain the login cookies before first use.
def spider_create_cookie():
    """Open the login page in a visible browser and save the cookies.

    After clicking the login-panel link the function sleeps 10 seconds
    (presumably to let the user log in manually — confirm), pickles the
    browser cookies to ``cookies.pkl`` and shuts the browser down.
    """
    global cookiedumped
    init_web_driver(1)
    DRIVER.get('https://www.qichacha.com/user_login')
    DRIVER.find_element_by_xpath('//*[@id="verifyLoginPanel"]/div[1]/a').click()
    time.sleep(10)
    print(DRIVER.current_url)
    with open("cookies.pkl", "wb") as cookie_file:
        pickle.dump(DRIVER.get_cookies(), cookie_file)
    print('Cookies loaded.')
    cookiedumped = 1
    # quit() rather than close(): close() only closes the window and would
    # leave the chromedriver process running when a new driver is created.
    close_web_driver()


def visit_webpage(company_name):
    """Search for *company_name*, open the first result and export it.

    On the first call the cookies are created if necessary, a browser is
    started and the saved cookies are loaded. Later calls reuse the open
    browser and search through the page-header search box.
    """
    if cookiedumped == 0:
        spider_create_cookie()
    if browser_loaded == 1:
        DRIVER.find_element_by_id("headerKey").send_keys(company_name)
        DRIVER.find_element_by_xpath('/html/body/header/div/form/div/div/span/button').click()

    if cookiedumped == 1 and browser_loaded == 0:
        init_web_driver(debugmode)
        DRIVER.get('https://www.qichacha.com/')
        # NOTE: pickle must only be used on trusted files; cookies.pkl is
        # expected to be the file written by spider_create_cookie().
        with open("cookies.pkl", "rb") as cookie_file:
            cookies = pickle.load(cookie_file)
        for cookie in cookies:
            DRIVER.add_cookie(cookie)
        DRIVER.find_element_by_id("searchkey").send_keys(company_name)
        DRIVER.find_element_by_id("V3_Search_bt").click()

    # Follow the link of the first search result.
    DRIVER.get(DRIVER.find_element_by_class_name("ma_h1").get_attribute("href"))
    table = DRIVER.find_element_by_xpath('//*[@id="Cominfo"]/table[2]')
    if csvinited == 0:
        init_csv()
    table_reduction(company_name, table)


def main():
    """Process every company in ``companypath`` and log the failures.

    Failures are printed and appended, one per line with the company name
    and the exception, to ``./log/<timestamp>_ERROR.log``; processing then
    continues with the next company.
    """
    global companys
    companys = get_companylist(companypath)
    amount = len(companys)

    log_dir = './log/'
    os.makedirs(log_dir, exist_ok=True)
    filename = log_dir + time.strftime('%Y-%m-%d_%H-%M') + '_ERROR.log'

    # UTF-8 so that company names can be logged regardless of the locale.
    with open(filename, 'a+', encoding='utf-8') as fp:
        for i, items in enumerate(companys, start=1):
            dur()
            try:
                visit_webpage(items)
            except Exception as exc:
                # Top-level boundary: record the failure and keep going.
                print(items + ' FAILED TO CATCH')
                fp.write('%s %s FAILED TO LOAD (%r)\n'
                         % (time.strftime('%Y-%m-%d_%H:%M:%S'), items, exc))
            else:
                dur(str(i) + ' of ' + str(amount) + ' ' + items)


if __name__ == '__main__':
    durt()
    try:
        main()
    finally:
        # Only shut the browser down if one was actually started.
        if browser_loaded:
            close_web_driver()
    print(str(len(companys)) + ' items finieshed! ')
    durt('TOTALY')
本地配置文件
[config]
debugmode=0
cookiedumped=0
csvpath=Result.csv
companypath=CompanyList.txt
chromedriver=.\chromedriver.exe
export=0

[sqlcon]
ip_port=
username=
pwd=
dbnanme=
本地企业列表
CompanyList.txt,每行放置一个企业名称或统一社会信用代码