pa_baidu(主程序) 精细化写法
# -*- coding: utf-8 -*-
# 斌彬电脑
# @Time : 2018/10/14 0014 18:49

import requests
from lxml import etree
import word_recognition  # 文字识别

def get_baidu(n):
    """Search Baidu for *n* and return the text of every result page.

    Args:
        n: The query string; it is sent as the ``wd`` request parameter.

    Returns:
        A list with one string per result page that could be downloaded and
        parsed: all text nodes of that page concatenated.  Result pages that
        fail to download or parse are skipped instead of aborting the search.

    Raises:
        requests.RequestException: If the search request itself fails.
    """
    # Query parameters copied from a browser search.  The browser request also
    # carried oq, rsv_pq, rsv_t, inputT and rsv_sug4; they are left out here.
    data = {
        'ie': 'utf-8',
        'f': '8',
        'rsv_bp': '1',
        'tn': 'baidu',
        'wd': n,
        'rqlang': 'cn',
        'rsv_enter': '1',
        'rsv_sug3': '2',
        'rsv_sug1': '2',
        'rsv_sug7': '100',
        'rsv_n': '2',
        'rsv_sug2': '0',
    }
    url = 'https://www.baidu.com/s'

    h2 = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    # requests never times out by default; without this a stalled server
    # would block the program forever.
    timeout = 10

    res = requests.get(url, params=data, headers=h2, timeout=timeout)
    html = etree.HTML(res.text)
    if html is None:
        # Nothing parseable came back, so there are no result links.
        return []

    # The href of every search hit (the links sit inside <h3 class="t">).
    ht = html.xpath('//h3[@class="t"]/*/@href')

    li = []
    for link in ht:
        try:
            page = requests.get(link, headers=h2, timeout=timeout)
            # Parse the raw bytes so lxml applies the page's own encoding.
            doc = etree.HTML(page.content)
        except (requests.RequestException, ValueError, etree.LxmlError):
            # One dead or malformed result must not discard the others.
            continue
        if doc is None:
            continue
        li.append(''.join(doc.xpath('//*/text()')))

    return li

if __name__ == '__main__':
    b = word_recognition.BaiDuAPI('co.ini')
    # OCR the screenshot into a list of recognised text lines.
    text = b.shi_bie('1.png')

    # The question is taken to be the longest recognised line.  The last line
    # is never a candidate, because at least one option must follow it.
    candidates = text[:-1]
    if not candidates:
        # Zero or one recognised line: there is no question/option pair.
        print('找不到')
    else:
        # max() keeps the first of several equally long lines.
        q = max(range(len(candidates)), key=lambda k: len(candidates[k]))
        tx = text[q]
        print('题目是:', tx)
        print('正在百度搜索答案......')
        try:
            # Join the text of all result pages into one searchable string.
            a = ''.join(get_baidu(tx))
        except Exception:
            # Top-level boundary: any search failure is reported as
            # "not found" rather than a traceback.
            print('找不到')
        else:
            # Output prefixes for options 1-4, kept exactly as printed before.
            labels = ('答案是:1 ', '答案是:2  ', '答案是:3 ', '答案是: 4 ')
            # Up to four lines after the question are the answer options; the
            # first one that occurs in the search results wins.
            for label, option in zip(labels, text[q + 1:q + 5]):
                if option in a:
                    print(label, option)
                    break
            else:
                print('找不到')

  

pa_baidu(主程序)
# -*- coding:utf-8 -*-
# 斌彬电脑
# @Time :   2018/9/9 0009    下午 1:17

import requests
from lxml import etree
import  word_recognition #  文字识别
def get_baidu(n):
    """Search Baidu for *n* and return the <meta> contents of each result page.

    Only result pages that declare a UTF-8, GB2312 or GBK charset through
    ``<meta charset=...>`` are used; the comparison ignores case.

    Args:
        n: The query string; it is sent as the ``wd`` request parameter.

    Returns:
        A list with one string per usable result page: the ``content``
        attributes of all its ``<meta>`` tags concatenated.  Pages that fail
        to download or parse, declare another charset, or have no such
        attributes are skipped.

    Raises:
        requests.RequestException: If the search request itself fails.
    """
    # Query parameters copied from a browser search.  The browser request also
    # carried oq, rsv_pq, rsv_t, inputT and rsv_sug4; they are left out here.
    data = {
        'ie': 'utf-8',
        'f': '8',
        'rsv_bp': '1',
        'tn': 'baidu',
        'wd': n,
        'rqlang': 'cn',
        'rsv_enter': '1',
        'rsv_sug3': '2',
        'rsv_sug1': '2',
        'rsv_sug7': '100',
        'rsv_n': '2',
        'rsv_sug2': '0',
    }
    url = 'https://www.baidu.com/s'

    h2 = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    # requests never times out by default; without this a stalled server
    # would block the program forever.
    timeout = 10

    # Declared charset (lower-cased) -> codec used to parse the page.
    # GB2312 pages are read as GBK, as the original code did.
    codec_for = {'utf-8': 'utf-8', 'gb2312': 'gbk', 'gbk': 'gbk'}

    res = requests.get(url, params=data, headers=h2, timeout=timeout)
    html = etree.HTML(res.text)
    if html is None:
        # Nothing parseable came back, so there are no result links.
        return []

    # The href of every search hit (the links sit inside <h3 class="t">).
    ht = html.xpath('//h3[@class="t"]/*/@href')

    li = []
    for link in ht:
        try:
            page = requests.get(link, headers=h2, timeout=timeout)
            # First pass over the raw bytes, only to read the declared charset.
            sniffed = etree.HTML(page.content)
            if sniffed is None:
                continue
            declared = ''.join(sniffed.xpath('//meta/@charset')).strip().lower()
            codec = codec_for.get(declared)
            if codec is None:
                # No charset declared, or one this function does not handle.
                continue
            # Second pass with the encoding forced on the parser.  Parsing
            # bytes avoids both strict-decode failures and lxml's refusal of
            # str input that carries an XML encoding declaration.
            doc = etree.HTML(page.content,
                             parser=etree.HTMLParser(encoding=codec))
        except (requests.RequestException, ValueError, etree.LxmlError):
            # One dead or malformed result must not discard the others.
            continue
        if doc is None:
            continue
        # <meta content=...> holds the description/keywords text that pages
        # expose to search engines.
        p = doc.xpath('//meta/@content')
        if p:
            li.append(''.join(p))

    return li

if __name__ == '__main__':
    b = word_recognition.BaiDuAPI('co.ini')
    # OCR the screenshot into a list of recognised text lines.
    text = b.shi_bie('2.png')

    # The question is taken to be the longest recognised line.  The last line
    # is never a candidate, because at least one option must follow it.
    candidates = text[:-1]
    if not candidates:
        # Zero or one recognised line: there is no question/option pair.
        print('找不到')
    else:
        # max() keeps the first of several equally long lines.
        q = max(range(len(candidates)), key=lambda k: len(candidates[k]))
        tx = text[q]
        print('题目是:', tx)
        print('正在百度搜索答案......')
        try:
            # Join the text of all result pages into one searchable string.
            a = ''.join(get_baidu(tx))
        except Exception:
            # Top-level boundary: any search failure is reported as
            # "not found" rather than a traceback.
            print('找不到')
        else:
            # Output prefixes for options 1-4, kept exactly as printed before.
            labels = ('答案是:1 ', '答案是:2  ', '答案是:3 ', '答案是: 4 ')
            # Up to four lines after the question are the answer options; the
            # first one that occurs in the search results wins.
            for label, option in zip(labels, text[q + 1:q + 5]):
                if option in a:
                    print(label, option)
                    break
            else:
                print('找不到')

  

百度文字识别    word_recognition

# -*- coding:utf-8 -*-
# 斌彬电脑
# @Time :   2018/9/9 0009    下午 2:00

import configparser        #  读写配置文件
from aip import AipOcr    # pip install baidu-aip  百度文字识别


class BaiDuAPI(object):
    """Wrapper around the Baidu ``AipOcr`` client for recognising image text."""

    def __init__(self, filePath):
        """Create the OCR client from credentials stored in an INI file.

        Args:
            filePath: Path of an INI file whose ``[我的 KEY]`` section holds
                the ``APP_ID``, ``API_KEY`` and ``SECRET_KEY`` options.

        Raises:
            FileNotFoundError: If the file is missing or unreadable.
            configparser.Error: If the section or one of the options is
                absent.
        """
        target = configparser.ConfigParser()

        # read() silently skips files it cannot open and returns the list of
        # files it did parse, so an empty result means the credentials file
        # was not loaded.  'utf-8-sig' tolerates a leading byte-order mark.
        if not target.read(filePath, encoding='utf-8-sig'):
            raise FileNotFoundError(
                'cannot read config file: {!r}'.format(filePath))

        app_id = target.get('我的 KEY', 'APP_ID')
        api_key = target.get('我的 KEY', 'API_KEY')
        secret_key = target.get('我的 KEY', 'SECRET_KEY')
        # AipOcr is the client class shipped in the baidu-aip package.
        self.client = AipOcr(app_id, api_key, secret_key)

    @staticmethod
    def get_a(filePath):
        """Return the raw bytes of the image file at *filePath*."""
        with open(filePath, 'rb') as f:
            return f.read()

    def shi_bie(self, filePath):
        """Recognise the text in an image.

        Args:
            filePath: Path of the image file to send to the OCR service.

        Returns:
            A list with the ``words`` string of every entry in the response's
            ``words_result`` list, in response order.

        Raises:
            RuntimeError: If the response has no ``words_result`` entry; the
                whole response is included in the message.
        """
        image = self.get_a(filePath)
        texts = self.client.basicGeneral(image)
        if 'words_result' not in texts:
            # Show what the service actually returned instead of failing
            # with a bare KeyError.
            raise RuntimeError(
                'OCR response has no words_result: {!r}'.format(texts))
        return [entry['words'] for entry in texts['words_result']]

if __name__ == '__main__':
    # Manual smoke test: recognise the sample image and show a few lines.
    d = BaiDuAPI('co.ini')
    text = d.shi_bie('1.png')
    # Print recognised lines 7 through 11.  Slicing prints whatever exists
    # instead of raising IndexError when fewer than 12 lines come back.
    for line in text[7:12]:
        print(line)

  

 

 

co.ini

[我的 KEY]
APP_ID = 123
API_KEY = abcL
SECRET_KEY = dddu
;client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

 

版权声明:本文为gdwz922原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/gdwz922/articles/9617832.html