今天要用昨天训练好的模型来试试看能否顺利从我们的目标网站取得资讯!
我们要先用selenium来处理网站的下拉式选单。
第一个是发证日期的年月日,我们可以观察到年的部分要抓取的话,例如109年就直接输入109就可以抓取到了。
月和日的部分也是。
接下来发证地点这里我们就要先给它编号了。
例如北县,我们就必须给他10001的值。
# Option values of the issuing-place drop-down, mapped to the short place
# labels used when entering data.
place_dict = {
    "10001": "北县",
    "10002": "宜县",
    "10003": "桃县",
    "10004": "竹县",
    "10005": "苗县",
    "10006": "中县",
    "10007": "彰县",
    "10008": "投县",
    "10009": "云县",
    "10010": "嘉县",
    "10011": "南县",
    "10012": "高县",
    "10013": "屏县",
    "10014": "东县",
    "10015": "花县",
    "10016": "澎县",
    "10017": "基市",
    "10018": "竹市",
    "10020": "嘉市",
    "09007": "连江",
    "09020": "金门",
    "63000": "北市",
    "64000": "高市",
    "65000": "新北市",
    "66000": "中市",
    "67000": "南市",
    "68000": "桃市",
}
而领补换类别也是,我们需要给他编号才能抓取的到。
# Option values of the issue-type drop-down ("1", "2", "3"), mapped to their
# labels in the same order.
reason_dict = {
    str(code): label
    for code, label in enumerate(("初发", "补发", "换发"), start=1)
}
# Callers type the human-readable labels (e.g. "北县", "换发"), so invert both
# tables into label -> option-value lookups.
place_key = dict(zip(place_dict.values(), place_dict.keys()))
reason_key = dict(zip(reason_dict.values(), reason_dict.keys()))
我们一样在背景执行。
# Query page that every function in this file drives through `driver`.
RIS_QUERY_URL = "https://www.ris.gov.tw/apply-idCard/app/idcard/IDCardReissue/main"

# Start Chrome without a visible window and open the query page.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
driver.get(RIS_QUERY_URL)
这个程序用来处理我们的身分证资讯。
def person_information(ID, year, month, date, place, reason):
    """Fill in the ID-card query form on the page loaded in ``driver``.

    Args:
        ID: ID number typed into the ``idnum94`` input (the field is cleared
            first).
        year: Option value to select in the ``applyTWY`` drop-down.
        month: Option value to select in the ``applyMM`` drop-down.
        date: Option value to select in the ``applyDD`` drop-down.
        place: Issuing-place label; must be a key of ``place_key``.
        reason: Issue-type label; must be a key of ``reason_key``.

    Raises:
        KeyError: If ``place`` or ``reason`` is not a known label. The lookup
            happens before the page is touched, so a typo never leaves the
            form half filled.
    """
    # Function-scope import: the find_element_by_* helpers were removed in
    # Selenium 4.3, while find_element(By.<kind>, ...) works on both 3.x and 4.x.
    from selenium.webdriver.common.by import By

    place_code = place_key[place]
    reason_code = reason_key[reason]

    # Look the input up once; clear() returns None, so its result is not kept.
    id_input = driver.find_element(By.XPATH, "//input[@id='idnum94']")
    id_input.clear()
    id_input.send_keys(ID)

    # Drop-downs are filled in the same order as they appear on the form:
    # year, month, day, issuing place, issue type.
    selections = (
        ("applyTWY", year),
        ("applyMM", month),
        ("applyDD", date),
        ("siteId", place_code),
        ("applyReason", reason_code),
    )
    for field_name, option_value in selections:
        dropdown = Select(driver.find_element(By.NAME, field_name))
        dropdown.select_by_value(option_value)
填选完之後我们要输入验证码,所以我们必须先抓到当下页面的验证码图片。作法一样,我们先全屏截图并存到 filepath_input,再依验证码图片在页面上的位置裁切;裁切出来的验证码图片不另外存档,直接回传使用。
def captcha_pic(filepath_input):
    """Screenshot the whole page and return the cropped captcha image.

    The browser window is resized to the document's scroll width/height so
    the screenshot covers the full page, the screenshot is written to
    ``filepath_input``, and the captcha element's bounding box is cropped
    out of it.

    Args:
        filepath_input: Path the full-page screenshot is saved to.

    Returns:
        The captcha region as an RGB PIL image (not written to disk).
    """
    # Function-scope import: find_element_by_xpath was removed in Selenium 4.3;
    # find_element(By.XPATH, ...) works on both 3.x and 4.x.
    from selenium.webdriver.common.by import By

    page_width = driver.execute_script('return document.body.parentNode.scrollWidth')
    page_height = driver.execute_script('return document.body.parentNode.scrollHeight')
    driver.set_window_size(page_width, page_height)
    driver.save_screenshot(filepath_input)

    # Bounding box of the captcha image inside the page.
    # NOTE(review): assumes screenshot pixels map 1:1 onto page coordinates
    # (no device-pixel-ratio scaling) -- confirm for the target environment.
    element = driver.find_element(By.XPATH, '//*[@id="captchaImage_captcha-refresh"]')
    location = element.location
    size = element.size
    left = location['x']
    top = location['y']
    right = left + size['width']
    bottom = top + size['height']

    # The context manager releases the screenshot file handle; the cropped and
    # converted copy is created before the block exits.
    with Image.open(filepath_input) as screenshot:
        return screenshot.crop((left, top, right, bottom)).convert("RGB")
我们还会需要读取模型。
def model_for_captcha(model_path):
    """Build the DenseNet-201 captcha classifier and load its trained weights.

    Args:
        model_path: Path to a state-dict checkpoint for a
            ``densenet201(num_classes=180)`` network.

    Returns:
        The model with the checkpoint's weights loaded, on the CPU.
    """
    model = models.densenet201(num_classes=180)
    # map_location="cpu" lets a checkpoint saved from a CUDA device load on a
    # machine without a GPU; the model itself is never moved off the CPU here.
    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    return model
再来就是将刚刚抓取的图片丢入模型预测是甚麽字啦~~
def captcha_answer(img, model):
    """Predict the characters of one captcha image.

    Args:
        img: Image accepted by ``ToTensor``; it must convert to a
            3 x 100 x 240 tensor, which the ``view`` call below enforces.
        model: Classifier whose output for one image can be reshaped into
            5 rows of 36 class scores. It is switched to eval mode.

    Returns:
        A ``(pred, output)`` tuple: ``pred`` is the decoded string (each
        predicted index looked up in ``alphabet``) and ``output`` is the 1-D
        tensor holding the 5 predicted class indices.
    """
    to_tensor = Compose([ToTensor()])
    batch = to_tensor(img).view(1, 3, 100, 240)

    model.eval()
    # Pure inference: skip building an autograd graph for the forward pass.
    with torch.no_grad():
        scores = model(batch).view(-1, 36)
        probabilities = nn.functional.softmax(scores, dim=1)
        output = torch.argmax(probabilities, dim=1).view(-1, 5)[0]

    pred = ''.join([alphabet[i] for i in output.cpu().numpy()])
    return pred, output
最後我们将全部整合在一起,我们会用到while回圈,因为万一预测的验证码是错的,我们必须重新自动再跑一次,直到正确为止。
# Retry until the captcha is accepted, then scrape the query result.
def work_all(ID, year, month, date, place, reason, filepath_input, model_path, filepath_output):
    """Fill in the query form, solve the captcha and return the result.

    The form is filled in and submitted repeatedly until the page no longer
    shows an ``error-message`` element after submission; the result block is
    then read and a full-page screenshot captured.

    Args:
        ID, year, month, date, place, reason: Forwarded unchanged to
            ``person_information``.
        filepath_input: Path ``captcha_pic`` writes the page screenshot to.
        model_path: Checkpoint path handed to ``model_for_captcha``.
        filepath_output: Currently unused -- the final screenshot is returned
            as base64 instead of being written to this path. Kept so existing
            callers keep working.

    Returns:
        dict with the ID number, issue date, issuing place, issue type and
        freshness verdict as shown on the result page, plus the final
        screenshot as a base64 string.
    """
    # Function-scope import: the find_element_by_* helpers were removed in
    # Selenium 4.3; find_element(s)(By.XPATH, ...) works on both 3.x and 4.x.
    from selenium.webdriver.common.by import By

    result_root = "//*[@id='resultBlock']/div[2]/div[2]/div[1]/div[2]"

    def _result_text(suffix):
        # Text of one element below the result block.
        return driver.find_element(By.XPATH, result_root + suffix).text

    # The weights do not change between attempts, so load them once instead
    # of on every retry.
    model = model_for_captcha(model_path)

    while True:
        person_information(ID, year, month, date, place, reason)
        img = captcha_pic(filepath_input)
        pred, _ = captcha_answer(img, model)
        print("==========================================")
        print('pred: ' + pred)

        # Clear the captcha field, then type the prediction.
        captcha_input = driver.find_element(
            By.XPATH, "//input[@id='captchaInput_captcha-refresh']")
        captcha_input.clear()
        captcha_input.send_keys(pred)

        # Submit and give the page time to respond.
        driver.find_element(
            By.XPATH, "//button[@class = 'btn btn-primary query']").click()
        time.sleep(5)

        # find_elements returns an empty list when nothing matches, so no
        # catch-all except is needed to detect the "no error" case.
        if driver.find_elements(By.XPATH, "//div[@class='error-message']"):
            print("验证码输入错误,重新填写验证码")
            print("==========================================")
            continue
        break

    user_info = {
        "身分证字号": _result_text("/table/tbody/tr[1]/td/strong"),
        "发证日期": _result_text("/table/tbody/tr[2]/td/strong"),
        "发证地点": _result_text("/table/tbody/tr[3]/td/strong"),
        "领补换类别": _result_text("/table/tbody/tr[4]/td/strong"),
        "是否为最新资料": _result_text("/div[2]/div"),
    }

    # Resize to the full document so the final screenshot covers the whole
    # result page.
    page_width = driver.execute_script('return document.body.parentNode.scrollWidth')
    page_height = driver.execute_script('return document.body.parentNode.scrollHeight')
    driver.set_window_size(page_width, page_height)
    user_info["最终截图base64码"] = driver.get_screenshot_as_base64()

    print("验证码输入正确,已储存图片")
    print("==========================================")
    return user_info
给定各个参数之後,执行结果发现准确度其实满好的,试了大概20次,全部都一次就得到我们要的结果了~
# Fill in your own ID-card details below, plus the file locations you want to
# use and the path to your trained model. The right-hand sides are left blank
# on purpose and must be supplied before this runs.
# year / month / date are passed to Select.select_by_value, so they are the
# option values of the date drop-downs.
# place must be one of the labels in place_dict (e.g. "北县") and reason one
# of the labels in reason_dict (e.g. "换发").
ID =
year =
month =
date =
place =
reason =
# filepath_input: where the full-page screenshot used for cropping the captcha
# is written. model_path: checkpoint loaded by model_for_captcha.
# filepath_output: accepted by work_all but not written to by the code shown.
filepath_input =
model_path =
filepath_output =
# Run the whole flow and pretty-print the scraped result.
user_info = work_all(ID,year,month,date,place,reason,filepath_input,model_path,filepath_output)
pprint.pprint(user_info)
这是我们填完资讯之後,尚未填入验证码的画面。
而我们的模型预测也正确。
最後也成功得到我们要的资讯啦~
-
今日小结
验证码辨识可以说是在学图像辨识时,最简单也最好蒐集图片的一个练习。刚好公司有用到,就来分享给大家,如有写的不好的地方,欢迎鞭打我XDDD。