微信答题王 + 百度文字识别 + 百度搜索
pa_baidu(主程序) 精细化写法
# -*- coding: utf-8 -*-
# Author: 斌彬电脑
# @Time : 2018/10/14 0014 18:49
"""Answer a quiz screenshot: OCR the image, search Baidu, match the options.

The screenshot is turned into text lines by ``word_recognition.BaiDuAPI``.
The longest line (ignoring the last one) is treated as the question and the
lines that follow it as the candidate answers. The question is searched on
Baidu, every result page is downloaded, and the first candidate that occurs
in the downloaded text is reported.
"""
import requests
from lxml import etree

import word_recognition  # OCR wrapper around the Baidu text-recognition API

SEARCH_URL = 'https://www.baidu.com/s'
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
# Without a timeout a single unresponsive site would hang the script forever.
REQUEST_TIMEOUT = 10
# One label per supported option slot; the texts are printed verbatim.
ANSWER_LABELS = ('答案是:1 ', '答案是:2 ', '答案是:3 ', '答案是: 4 ')


def get_baidu(n):
    """Search Baidu for *n* and return the text of every result page.

    Args:
        n: The query string (the quiz question).

    Returns:
        A list with one string per result page that could be downloaded and
        parsed; each string is the concatenation of all text nodes of that
        page. Pages that fail to download or parse are skipped.

    Raises:
        requests.RequestException: If the search request itself fails.
    """
    # Session-specific parameters seen in the browser (oq, rsv_pq, rsv_t,
    # inputT, rsv_sug4) are intentionally left out.
    params = {
        'ie': 'utf-8',
        'f': '8',
        'rsv_bp': '1',
        'tn': 'baidu',
        'wd': n,
        'rqlang': 'cn',
        'rsv_enter': '1',
        'rsv_sug3': '2',
        'rsv_sug1': '2',
        'rsv_sug7': '100',
        'rsv_n': '2',
        'rsv_sug2': '0',
    }
    response = requests.get(
        SEARCH_URL, params=params, headers=REQUEST_HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    try:
        results_page = etree.HTML(response.text)
    except (etree.LxmlError, ValueError):
        return []
    if results_page is None:  # lxml yields None for an empty document
        return []

    # Every search hit is a link inside an <h3 class="t"> heading.
    links = results_page.xpath('//h3[@class="t"]/*/@href')

    pages = []
    for link in links:
        # One broken result must not abort the whole search.
        try:
            reply = requests.get(
                link, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT,
            )
            tree = etree.HTML(reply.content)
        except (requests.RequestException, etree.LxmlError, ValueError):
            continue
        if tree is None:
            continue
        pages.append(''.join(tree.xpath('//*/text()')))
    return pages


def _find_question_index(lines):
    """Return the index of the longest line, ignoring the last line.

    The last line is excluded because at least one option has to follow the
    question. Returns None when there are fewer than two lines.
    """
    if len(lines) < 2:
        return None
    # max() keeps the first of several equally long lines.
    return max(range(len(lines) - 1), key=lambda index: len(lines[index]))


def _pick_answer(options, corpus):
    """Return (position, option) of the first option found in *corpus*.

    Empty options are ignored because the empty string is contained in any
    text. Returns None when no option matches.
    """
    for position, option in enumerate(options):
        if option and option in corpus:
            return position, option
    return None


def main(image_path='1.png', config_path='co.ini'):
    """Recognise the quiz in *image_path* and print the most likely answer."""
    ocr = word_recognition.BaiDuAPI(config_path)
    lines = ocr.shi_bie(image_path)

    question_index = _find_question_index(lines)
    if question_index is None:
        print('找不到')
        return

    question = lines[question_index]
    print('题目是:', question)
    print('正在百度搜索答案......')
    try:
        corpus = ''.join(get_baidu(question))
    except requests.RequestException:
        print('找不到')
        return

    # Slicing never raises, even when fewer than four options were recognised.
    first_option = question_index + 1
    options = lines[first_option:first_option + len(ANSWER_LABELS)]
    match = _pick_answer(options, corpus)
    if match is None:
        print('找不到')
        return
    position, option = match
    print(ANSWER_LABELS[position], option)


if __name__ == '__main__':
    main()
pa_baidu(主程序)
# -*- coding:utf-8 -*-
# Author: 斌彬电脑
# @Time : 2018/9/9 0009 下午 1:17
"""Answer a quiz screenshot using the <meta> descriptions of Baidu results.

The screenshot is turned into text lines by ``word_recognition.BaiDuAPI``.
The longest line (ignoring the last one) is treated as the question and the
lines that follow it as the candidate answers. The question is searched on
Baidu and, for every result page that declares a supported charset, the
``content`` attributes of its <meta> tags are collected. The first candidate
that occurs in the collected text is reported.
"""
import requests
from lxml import etree

import word_recognition  # OCR wrapper around the Baidu text-recognition API

SEARCH_URL = 'https://www.baidu.com/s'
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
# Without a timeout a single unresponsive site would hang the script forever.
REQUEST_TIMEOUT = 10
# Declared <meta charset> value (lower-cased) -> codec used for decoding.
# GB2312 pages are decoded as GBK, which is a superset of GB2312.
CHARSET_CODECS = {
    'utf-8': 'utf-8',
    'utf8': 'utf-8',
    'gb2312': 'gbk',
    'gbk': 'gbk',
}
# One label per supported option slot; the texts are printed verbatim.
ANSWER_LABELS = ('答案是:1 ', '答案是:2 ', '答案是:3 ', '答案是: 4 ')


def _parse_html(markup):
    """Parse *markup* with lxml; return None when it cannot be parsed."""
    try:
        return etree.HTML(markup)
    except (etree.LxmlError, ValueError):
        return None


def _extract_meta_text(reply):
    """Return the joined <meta content> values of a downloaded page.

    The page is parsed once to read its declared <meta charset>, then decoded
    with the matching codec and parsed again. Returns '' when the page
    declares no charset, declares an unsupported one, or cannot be parsed.
    """
    sniffed = _parse_html(reply.text)
    if sniffed is None:
        return ''
    declared = sniffed.xpath('//meta/@charset')
    if not declared:
        return ''
    # Compare case-insensitively and use only the first declaration.
    codec = CHARSET_CODECS.get(declared[0].strip().lower())
    if codec is None:
        return ''
    # errors='replace' keeps pages that contain a few invalid bytes usable.
    tree = _parse_html(reply.content.decode(codec, errors='replace'))
    if tree is None:
        return ''
    # The meta content is the summary a page exposes to search engines.
    return ''.join(tree.xpath('//meta/@content'))


def get_baidu(n):
    """Search Baidu for *n* and return the meta text of the result pages.

    Args:
        n: The query string (the quiz question).

    Returns:
        A list with one non-empty string per result page from which meta
        content could be extracted. Pages that fail to download, declare no
        supported charset, or carry no meta content are skipped.

    Raises:
        requests.RequestException: If the search request itself fails.
    """
    # Session-specific parameters seen in the browser (oq, rsv_pq, rsv_t,
    # inputT, rsv_sug4) are intentionally left out.
    params = {
        'ie': 'utf-8',
        'f': '8',
        'rsv_bp': '1',
        'tn': 'baidu',
        'wd': n,
        'rqlang': 'cn',
        'rsv_enter': '1',
        'rsv_sug3': '2',
        'rsv_sug1': '2',
        'rsv_sug7': '100',
        'rsv_n': '2',
        'rsv_sug2': '0',
    }
    response = requests.get(
        SEARCH_URL, params=params, headers=REQUEST_HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    results_page = _parse_html(response.text)
    if results_page is None:
        return []

    # Every search hit is a link inside an <h3 class="t"> heading.
    links = results_page.xpath('//h3[@class="t"]/*/@href')

    summaries = []
    for link in links:
        # One broken result must not abort the whole search.
        try:
            reply = requests.get(
                link, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            continue
        summary = _extract_meta_text(reply)
        if summary:
            summaries.append(summary)
    return summaries


def _find_question_index(lines):
    """Return the index of the longest line, ignoring the last line.

    The last line is excluded because at least one option has to follow the
    question. Returns None when there are fewer than two lines.
    """
    if len(lines) < 2:
        return None
    # max() keeps the first of several equally long lines.
    return max(range(len(lines) - 1), key=lambda index: len(lines[index]))


def _pick_answer(options, corpus):
    """Return (position, option) of the first option found in *corpus*.

    Empty options are ignored because the empty string is contained in any
    text. Returns None when no option matches.
    """
    for position, option in enumerate(options):
        if option and option in corpus:
            return position, option
    return None


def main(image_path='2.png', config_path='co.ini'):
    """Recognise the quiz in *image_path* and print the most likely answer."""
    ocr = word_recognition.BaiDuAPI(config_path)
    lines = ocr.shi_bie(image_path)

    question_index = _find_question_index(lines)
    if question_index is None:
        print('找不到')
        return

    question = lines[question_index]
    print('题目是:', question)
    print('正在百度搜索答案......')
    try:
        corpus = ''.join(get_baidu(question))
    except requests.RequestException:
        print('找不到')
        return

    # Slicing never raises, even when fewer than four options were recognised.
    first_option = question_index + 1
    options = lines[first_option:first_option + len(ANSWER_LABELS)]
    match = _pick_answer(options, corpus)
    if match is None:
        print('找不到')
        return
    position, option = match
    print(ANSWER_LABELS[position], option)


if __name__ == '__main__':
    main()
百度文字识别 word_recognition
# -*- coding:utf-8 -*-
# Author: 斌彬电脑
# @Time : 2018/9/9 0009 下午 2:00
"""Thin wrapper around the Baidu OCR client (``baidu-aip`` package)."""
import configparser  # reads the credentials file

from aip import AipOcr  # pip install baidu-aip


class BaiDuAPI(object):
    """Recognise text in images through the Baidu OCR service."""

    def __init__(self, filePath):
        """Create the OCR client from the credentials stored in *filePath*.

        The INI file must contain a ``[我的 KEY]`` section with the options
        ``APP_ID``, ``API_KEY`` and ``SECRET_KEY``.

        Raises:
            FileNotFoundError: If *filePath* cannot be read.
            configparser.Error: If the section or one of the options is
                missing.
        """
        target = configparser.ConfigParser()
        # ConfigParser.read() silently skips unreadable files, so check its
        # return value to report the real cause instead of a later
        # NoSectionError. 'utf-8-sig' tolerates a BOM written by Windows
        # editors.
        if not target.read(filePath, encoding='utf-8-sig'):
            raise FileNotFoundError(
                'cannot read configuration file: {!r}'.format(filePath)
            )
        app_id = target.get('我的 KEY', 'APP_ID')
        api_key = target.get('我的 KEY', 'API_KEY')
        secret_key = target.get('我的 KEY', 'SECRET_KEY')
        self.client = AipOcr(app_id, api_key, secret_key)

    @staticmethod
    def get_a(filePath):
        """Return the raw bytes of the image stored at *filePath*."""
        with open(filePath, 'rb') as f:
            return f.read()

    def shi_bie(self, filePath):
        """Recognise the text in the image at *filePath*.

        Returns:
            A list with one string per entry of the service's
            ``words_result``; entries without a ``words`` field yield ''.

        Raises:
            KeyError: If the response carries no ``words_result``. The
                message includes the ``error_code`` and ``error_msg`` fields
                of the response when they are present.
        """
        image = self.get_a(filePath)
        texts = self.client.basicGeneral(image)
        if 'words_result' not in texts:
            raise KeyError(
                'words_result missing from OCR response '
                '(error_code={!r}, error_msg={!r})'.format(
                    texts.get('error_code'), texts.get('error_msg')
                )
            )
        return [entry.get('words', '') for entry in texts['words_result']]


if __name__ == '__main__':
    d = BaiDuAPI('co.ini')
    text = d.shi_bie('1.png')
    # Show lines 7..11; slicing avoids an IndexError on shorter results.
    for line in text[7:12]:
        print(line)
co.ini
[我的 KEY]
APP_ID = 123
API_KEY = abcL
SECRET_KEY = dddu
;client = AipOcr(APP_ID, API_KEY, SECRET_KEY)