#!/usr/bin/env python
# -*- coding: utf8 -*-
import requests, json
import base64
class PdfOcr(object):
    """Client for an HTTP OCR endpoint.

    A file is read from disk, base64-encoded and POSTed as the
    ``image_base64`` form field.  The JSON reply is expected to carry a
    ``result`` list whose entries each have a ``words`` string; those strings
    are concatenated into the returned text.
    """

    def __init__(self, url='xxxx'):
        """Remember the endpoint to post to.

        Args:
            url: Address of the OCR service.  Defaults to the ``'xxxx'``
                placeholder, so it must be set to a real endpoint before
                ``ocr_pdf`` can succeed.
        """
        self.url = url

    def ocr_pdf(self, filepath, timeout=30):
        """Send the file at *filepath* to the OCR service and return its text.

        Args:
            filepath: Path of the file to upload; it is opened in binary mode.
            timeout: Seconds passed to ``requests.post`` as its timeout.

        Returns:
            The concatenated ``words`` of every entry in the reply's
            ``result`` list, or ``None`` when ``result`` is missing or null.

        Raises:
            IOError/OSError: If the file cannot be opened or read.
            requests.RequestException: If the HTTP request fails.
            ValueError: If the reply body is not valid JSON.
        """
        with open(filepath, "rb") as pdf_file:
            # b64encode yields bytes; decode so the form field is plain text
            # on both Python 2 and Python 3.
            image_str = base64.b64encode(pdf_file.read()).decode('ascii')

        bodytmp = {
            "image_base64": image_str,
        }
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
        }
        res = requests.post(self.url, data=bodytmp, headers=headers,
                            timeout=timeout)
        main_content = json.loads(res.content.decode('utf8'))
        return self._join_words(main_content)

    @staticmethod
    def _join_words(main_content):
        """Concatenate the ``words`` of each ``result`` entry in a parsed reply.

        Returns ``None`` when the ``result`` key is absent or null, and an
        empty string when ``result`` is an empty list.
        """
        rest = main_content.get('result')
        if rest is None:
            return None
        return ''.join(entry['words'] for entry in rest)
if __name__ == '__main__':
    # Demo run: OCR one sample PDF and print whatever text comes back.
    # A template name ("fhx_zg_template") was once passed along as well;
    # it is currently not sent.
    sample_pdf = '20170507104235.pdf'
    client = PdfOcr()
    print(client.ocr_pdf(sample_pdf))
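# 网友评论 ("Reader comments" -- a heading left over from the web page this snippet was copied from; not part of the script)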