第一次
This commit is contained in:
commit
8354c7c677
10
.idea/.gitignore
generated
vendored
Normal file
10
.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
# 默认忽略的文件
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# 基于编辑器的 HTTP 客户端请求
|
||||
/httpRequests/
|
||||
# 依赖于环境的 Maven 主目录路径
|
||||
/mavenHomeManager.xml
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
9
.idea/jzd.iml
generated
Normal file
9
.idea/jzd.iml
generated
Normal file
@ -0,0 +1,9 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="JAVA_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
7
.idea/misc.xml
generated
Normal file
7
.idea/misc.xml
generated
Normal file
@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.12 (jzd)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" languageLevel="JDK_24" project-jdk-name="Python 3.12 (jzd)" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/jzd.iml" filepath="$PROJECT_DIR$/.idea/jzd.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
1
Data/json/1/0003.json
Normal file
1
Data/json/1/0003.json
Normal file
File diff suppressed because one or more lines are too long
1
Data/json/1/67997bf16a7c2e2f7489a6f97ae8a83e.json
Normal file
1
Data/json/1/67997bf16a7c2e2f7489a6f97ae8a83e.json
Normal file
File diff suppressed because one or more lines are too long
1
Data/json/1233/0003.json
Normal file
1
Data/json/1233/0003.json
Normal file
File diff suppressed because one or more lines are too long
12
Data/text/1/0003.txt
Normal file
12
Data/text/1/0003.txt
Normal file
@ -0,0 +1,12 @@
|
||||
安從來【恬憺虚无靜也法道清靜精氣内持故】【其虚邪不能為害恬蹄廉反舊音淡是】以志閑而
|
||||
少欲心安而不懼形勞而不倦䢿【機息故少欲外紛靜故心】【然情欲两亡是非一貫起】
|
||||
【居皆適故】【不倦也氣從以順各從其欲皆得所願】【志不貪故所欲皆】【順心易足故所願】
|
||||
【必從以不異求故无難得他老子】【曰知足不辱知止還殆回以長乆】故美其食【順精麄也髮新】【校正云按别本】
|
||||
【美一】【作甘】任其服【隨美】【惡也】樂其俗【去傾】【慕也】髙下不相慕其民故曰朴 无
|
||||
末也是所謂心足也老子曰禍莫大於不知足咎莫大於欲 得故知足之足常足矣盖非謂物足者為知足錢足者乃為
|
||||
【知足矣不恣於欲是則朴同故聖人云我無】【欲而民自朴矣新校集云按别本日作日是】以嗜欲不能
|
||||
【目不妄視故嗜欲不能勞心与玄】【勞其目淫邪不能惑其心】【同故淫邪不能惑老子曰不見可】【情計两】【欲使心不乱又昍愚智賢不肖不懼於物故合於道】【亡不為】【人為腹不為】【聖】【謀府冥心一觀勝𧴥俱捐故心志保安合同於道庚桑楚田】
|
||||
汝形抱汝生无使汝思慮營營字新校正云按全元起註
|
||||
【云合】【於道数】所以能年皆度百歲而動作不衰者以其德全不危
|
||||
【不渉於危故德全也莊子曰執道者德全德全者形全形】【也】【全者聖人之道也又曰无為而性命不全者未之有也】【材謂材幹可】【帝曰人年老而無子者材力盡邪將天數然也】【以立身者】
|
||||
伯 曰女子七歲腎氣盛 齒更髮長
|
17
Data/text/1/67997bf16a7c2e2f7489a6f97ae8a83e.txt
Normal file
17
Data/text/1/67997bf16a7c2e2f7489a6f97ae8a83e.txt
Normal file
@ -0,0 +1,17 @@
|
||||
人造字轉注之法厥後文字滋生莫識其本許君特得義故入句部許書分部之精類皆如此曷言乎其
|
||||
窮流溯源次弟先後轉注之指顯然矣顧其言凡某本義也許君每部之下悉以類相從雖後人假用已
|
||||
之屬皆从某也有指本部而言者有指異部而言者久許必標其本義以从本形使古人造字之意了然
|
||||
若白部自皎字至皛字均從白得義老部自耋字至可覩此轉注之一大例也如假氣爲气假前爲歬假
|
||||
考字皆從老得義此指本部而言也【此類隨】【舉皆是】若三部憂爲㥑假和爲龢經傳多有之詐則氣入米部饋客
|
||||
兜部只載一文亦特爲立部者以王王龜黽等字由芻米也而气部訓云雲气也象形而本義見矣前入
|
||||
此而生故也又或本字之類實無所屬而他部之字刀部齊斷也而止部歬字訓云不行而進謂之歬从
|
||||
旣有从此者則亦自爲一部如才耑等部僅載一文止在舟上而本義見矣憂入久部爲和之行和入口
|
||||
上下亦不相比次則所从又指遠部而言萈部云寛部訓云相應若此之類嚮非許君顯爲表出誰復知
|
||||
字从此【草爲山羊細角】【者故不入艸部】厂部云虒字从此是也顧許造字之本旨乎至其訓釋之例又約有六曰本字如
|
||||
君分部之例悉有精義非如後世字書僅據偏㫄區言部讓相責讓也辵部道所行道也之屬曰偏㫄字
|
||||
別若井部荆下云罰辠也从刀井易曰井者法也【段】【氏如辵部延正行也言部誼人所宜也之屬曰疉本字】
|
||||
【云此易】【緯說】㓝與罰俱从刀而㓝特入井部蓋罰者持刀如刄部刄行遲久久也夋行夋夋也之屬曰雙聲字
|
||||
罵詈【罰字】【本注】系諸受法者荆者以刀治辠系諸執法者如㫄溥也俚聊也鼓郭也儒柔也之屬曰疊韻字如
|
||||
重在守法故入井部此正得古人造字之本意他若戸護也門聞也之屬曰連字如𧺆部趍趙久也辵部
|
||||
𢆶部幽本从山𢆶亦聲也然以𢆶得義故入𢆶部【𢆶】【注】唐逮及也之屬此六例者要皆訓明本義使人知形
|
||||
【云㣲】【也句部拘鉤字本从手从金句亦聲也然以句曲聲與義相爲表裏而六書之旨益】明則許書本義之
|
13
Data/text/1233/0003.txt
Normal file
13
Data/text/1233/0003.txt
Normal file
@ -0,0 +1,13 @@
|
||||
𬼘王幼遐前輩景槧元巾箱本旧有注编次體例与孫稼航京兆𫠦藏元刻遮𨹧陈元龍片玉词注本無異
|
||||
惟𬼘题𭈹分卷尚仍其旧曰片玉䏻实昉于陈少章𢴃劉必欽叙知之至今歎之例宋元时選刻昔贤诗词
|
||||
集最多如杜工部白真山集日本皆𩔖偏清真词以分 類为最初刻證以方千里楊泽民和词之诠弟
|
||||
並止扵兹谝之𮦀賦𩔖其𢿘適府陈允平𫠦淂獨𭐴则曰以追和在後可知惜元鈔诸脱时見幼遐𡖋
|
||||
未之校雠然词中分段如垂𡖋釣鬲浦蓮近俱皎汲古反戈校丁𠜇夲弟長是旧𠜇之善北必光𬼘阏
|
||||
逢之歲大果月老芝𭁡識于半雨樓西牕
|
||||
【案𫠦𠜇藥堂詩餘载秋霁一首證以萹次前後皆選𫟈成之作则秋霽亦當屬𰛓真苐宋元时槧】【监鷗汲古諸𠜇並未之錄入闻疑載疑云尔余校𫟈成词凡卅餘過正其诸敬𫠦得实多是夲鶩】【之】【之元夲景𠜇明鈔不相失其旧故故未及校订误𠁅以徴盡善猶憶出京时】【翁据陈氏𦾔注】
|
||||
鹜翁斤丶原録𢴃當将别为校勘记附刊卷末至以余攺㝎䨥頭蓮西七闋字句谓有神助
|
||||
雖使美成復生必無異词是亦好之𬼘不觉其誉之過也余旅沽上三月中更喪洗卒丶 未有以報鹜
|
||||
翁今復旋吳閶人亊業蕞間亊旧業每一展诵是偏輙愳為㝠 爹之負行将入浙㦯扵湖山勝𠁅
|
||||
少得清致重为校㝎与许榆園商𣙜付鍥亦𠯁為片玉蕩滌纎瑕且有以副良友涶诿庶㡬
|
||||
幸甚躰问记光諸戊戍之𭘾十月朔日
|
||||
下接弟二页
|
15
Log/OCR/2025-08-08.log
Normal file
15
Log/OCR/2025-08-08.log
Normal file
@ -0,0 +1,15 @@
|
||||
2025-08-08 09:46:01 INFO 任务:1233 START 总数:1
|
||||
2025-08-08 09:46:01 INFO 序号 用时 字数 列数 大小 宽度 高度 路径
|
||||
2025-08-08 09:46:14 INFO 1 13.29 491 20 4615 3706 6871 D:/Important/software/软件交接/重要/重/籍智达OCR软件/籍智达OCR软件/jzd.0129/图/0003.JPG
|
||||
2025-08-08 09:46:14 INFO 任务:1233 END
|
||||
2025-08-08 09:46:14 INFO 总数 总用时 总字数 总列数 总大小
|
||||
2025-08-08 09:46:14 INFO 1/0 13.34 491 20 4725487
|
||||
|
||||
2025-08-08 10:34:30 INFO 任务:1 START 总数:1
|
||||
2025-08-08 10:34:30 INFO 压缩模式开启,最大像素: 2500
|
||||
2025-08-08 10:34:30 INFO 序号 用时 字数 列数 大小 宽度 高度 路径
|
||||
2025-08-08 10:34:38 INFO 1 7.90 481 53 1469 3075 6030 D:/Windos0/desktop/0003.jpg
|
||||
2025-08-08 10:34:38 INFO 任务:1 END
|
||||
2025-08-08 10:34:38 INFO 总数 总用时 总字数 总列数 总大小
|
||||
2025-08-08 10:34:38 INFO 1/0 7.92 481 53 1504099
|
||||
|
29
Log/OCR/2025-08-20.log
Normal file
29
Log/OCR/2025-08-20.log
Normal file
@ -0,0 +1,29 @@
|
||||
2025-08-20 14:02:33 INFO 任务:1 START 总数:1
|
||||
2025-08-20 14:02:33 INFO 序号 用时 字数 列数 大小 宽度 高度 路径
|
||||
2025-08-20 14:02:49 INFO 1 15.61 696 35 2290 889 1393 D:/Windos0/desktop/67997bf16a7c2e2f7489a6f97ae8a83e.png
|
||||
2025-08-20 14:02:49 INFO 任务:1 END
|
||||
2025-08-20 14:02:49 INFO 总数 总用时 总字数 总列数 总大小
|
||||
2025-08-20 14:02:49 INFO 1/0 15.69 696 35 2345247
|
||||
|
||||
2025-08-20 14:23:42 INFO 任务:11 START 总数:1
|
||||
2025-08-20 14:23:42 INFO 序号 用时 字数 列数 大小 宽度 高度 路径
|
||||
2025-08-20 14:23:47 WARNING 1 ocr failed D:/Important/software/软件交接/重要/重/籍智达OCR软件/籍智达OCR软件/jzd.0129/图/0003.JPG
|
||||
2025-08-20 14:23:47 INFO 任务:11 END
|
||||
2025-08-20 14:23:47 INFO 总数 总用时 总字数 总列数 总大小
|
||||
2025-08-20 14:23:47 INFO 0/1 5.04 0 0 0
|
||||
|
||||
2025-08-20 14:24:29 INFO 任务:11 START 总数:1
|
||||
2025-08-20 14:24:29 INFO 序号 用时 字数 列数 大小 宽度 高度 路径
|
||||
2025-08-20 14:24:34 WARNING 1 ocr failed D:/Windos0/desktop/0001/0001.png
|
||||
2025-08-20 14:24:34 INFO 任务:11 END
|
||||
2025-08-20 14:24:34 INFO 总数 总用时 总字数 总列数 总大小
|
||||
2025-08-20 14:24:34 INFO 0/1 5.04 0 0 0
|
||||
|
||||
2025-08-20 14:27:35 INFO 任务:1 START 总数:1
|
||||
2025-08-20 14:27:35 INFO 压缩模式开启,最大像素: 2500
|
||||
2025-08-20 14:27:35 INFO 序号 用时 字数 列数 大小 宽度 高度 路径
|
||||
2025-08-20 14:27:41 WARNING 1 ocr failed D:/Windos0/desktop/0001/0001.png
|
||||
2025-08-20 14:27:41 INFO 任务:1 END
|
||||
2025-08-20 14:27:41 INFO 总数 总用时 总字数 总列数 总大小
|
||||
2025-08-20 14:27:41 INFO 0/1 5.85 0 0 0
|
||||
|
0
Log/fail/ocr/1.txt
Normal file
0
Log/fail/ocr/1.txt
Normal file
1
Log/fail/ocr/11.txt
Normal file
1
Log/fail/ocr/11.txt
Normal file
@ -0,0 +1 @@
|
||||
D:/Windos0/desktop/0001/0001.png
|
0
Log/fail/ocr/1233.txt
Normal file
0
Log/fail/ocr/1233.txt
Normal file
20
config.json
Normal file
20
config.json
Normal file
@ -0,0 +1,20 @@
|
||||
{
|
||||
"login_url": "https://gj.cool/ocr_login",
|
||||
"apiid": "5be9a4ac35015a478425ba2d212545",
|
||||
"token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJmcmVzaCI6dHJ1ZSwiaWF0IjoxNzU1NjcxMDA4LCJqdGkiOiJmYjE4Mzc1Yy0yYmQwLTQ3MDUtOTg1Zi0yYmMyOGU0NTQ0NWQiLCJ0eXBlIjoiYWNjZXNzIiwic3ViIjoibWtHdFh0OW5NTUtQcGpuQ2daSXV6dz09XG4iLCJuYmYiOjE3NTU2NzEwMDgsImV4cCI6MTc2MzQ0NzAwOH0.UU441ZZVpQ5gxRHhWDuhUn1X8ToOMjN_kYH-5fuMWA6ZKfJTFWu29jzXrWZgSFbLmzONNrhrMPpg5bTPHALl7UJrs6yJO5G6Q264vAO4kwDyjLdrl55BGoh2DO_Lq3NWd8LvKfn64TIG620fJJuItNARczOO_TGUjLtBrSMRx30",
|
||||
"server": "web",
|
||||
"server_lst": {
|
||||
"web": "https://ap2.jzd.cool:9043",
|
||||
"local": "http://117.72.92.55:9012"
|
||||
},
|
||||
"local_head": "http://",
|
||||
"local_port": 9012,
|
||||
"max_size": 70,
|
||||
"interval": 0,
|
||||
"punct_max_length": 100000,
|
||||
"punct_detect_encoding": true,
|
||||
"punct_default_encoding": "utf-8",
|
||||
"timeout_connect": 15,
|
||||
"timeout_read": 300,
|
||||
"retry_time": 3
|
||||
}
|
1413
jzd_main.py
Normal file
1413
jzd_main.py
Normal file
File diff suppressed because it is too large
Load Diff
368
jzd_ocr.py
Normal file
368
jzd_ocr.py
Normal file
@ -0,0 +1,368 @@
|
||||
# last update: 2025-07-13
|
||||
# 下载离线安装包
|
||||
# mkdir lib
|
||||
# pip download requests whatimage tqdm opencv-python -d lib
|
||||
# 离线安装
|
||||
# 1. 解压/lib (python>=3.9) ;或解压/lib38 (python=3.8)
|
||||
# 2. pip install --no-index --find-links=lib requests whatimage tqdm opencv-python
|
||||
# 在线安装
|
||||
# pip install requests whatimage tqdm opencv-python
|
||||
|
||||
import json,logging,time,os
|
||||
from pathlib import Path
|
||||
from tkinter import filedialog, Tk
|
||||
import requests, whatimage
|
||||
from tqdm import tqdm
|
||||
from tempfile import NamedTemporaryFile
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
# Every path used by this module is anchored at the folder holding the script.
Start_dir = Path(__file__).parent
ConfigFile = Start_dir.joinpath('ocr_config.json')
AuthFile = Start_dir.joinpath('ocr_auth.json')

# Output tree:
#   Log/           log files (one sub-folder per log type, see logging_init)
#   Log/fail/ocr/  per-task lists of images that could not be processed
#   Data/json/     OCR responses saved as JSON
#   Data/text/     plain-text renderings of the OCR responses
Log_dir = Start_dir.joinpath('Log')
Data_dir = Start_dir.joinpath('Data')
Json_Data_dir = Data_dir.joinpath('json')
Text_Data_dir = Data_dir.joinpath('text')
Fail_dir = Log_dir.joinpath('fail')
Fail_OCR_dir = Fail_dir.joinpath('ocr')

# Created at import time. Parents are listed before their children because
# mkdir() is called without parents=True.
for _required_dir in (Log_dir, Data_dir, Json_Data_dir, Text_Data_dir, Fail_dir, Fail_OCR_dir):
    _required_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Default timestamp layout; also used as the default task name in the CLI.
TimeStampStr = '%Y-%m-%d_%H.%M.%S'


def get_timestamp(time_stamp_format=TimeStampStr):
    """Return the current local time formatted with ``time_stamp_format``.

    Args:
        time_stamp_format: A ``time.strftime`` format string.

    Returns:
        The formatted timestamp as a string.
    """
    now = time.localtime()
    return time.strftime(time_stamp_format, now)
|
||||
|
||||
|
||||
def data_to_text(data):
    """Rebuild line-broken text from parallel ``line_ids`` / ``chars`` lists.

    Each character is emitted in order; a newline follows every character
    that is the last one carrying its line id (the next id differs, or there
    is no next id).

    Args:
        data: Mapping that may contain ``'line_ids'`` and ``'chars'``.

    Returns:
        The assembled text, or ``''`` when either key is missing or ``None``.
    """
    line_ids = data.get('line_ids')
    chars = data.get('chars')
    if line_ids is None or chars is None:
        return ''

    last_pos = len(line_ids) - 1
    pieces = []
    for pos, (line_id, char) in enumerate(zip(line_ids, chars)):
        # The character stays on the current line only if the next id matches.
        line_continues = pos < last_pos and line_id == line_ids[pos + 1]
        pieces.append(char if line_continues else char + '\n')
    return ''.join(pieces)
|
||||
|
||||
|
||||
def resize_image(img_path, max_length:int):
    """Load an image, shrink it if needed, and return it re-encoded as JPEG.

    The image is scaled down so that its longer side is at most
    ``max_length`` pixels. It is always re-encoded as JPEG, even when no
    scaling was necessary.

    Args:
        img_path: Path of the image file (``str`` or ``Path``); may contain
            non-ASCII characters.
        max_length: Upper bound for the longer side in pixels. Values <= 0
            disable scaling.

    Returns:
        ``(img_bytes, resize_factor)``: the JPEG bytes and the factor the
        original dimensions were divided by (``1.0`` when not scaled).

    Raises:
        OSError: The file cannot be read.
        ValueError: The file is empty, cannot be decoded as an image, or
            cannot be encoded as JPEG.
    """
    import cv2
    import numpy as np

    # Read the bytes with Python and decode them in memory. The previous
    # str(path).encode('gbk').decode() round trip decoded GBK bytes as UTF-8,
    # which raises (or yields a wrong path) for any non-ASCII file name, and
    # cv2.imread is known to fail on non-ASCII paths on Windows.
    with open(img_path, 'rb') as f:
        raw = f.read()
    if not raw:
        raise ValueError(f'empty image file: {img_path}')

    img = cv2.imdecode(np.frombuffer(raw, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError(f'cannot decode image: {img_path}')

    height, width = img.shape[:2]
    longest = max(height, width)
    resize_factor: float = longest / max_length if longest > max_length > 0 else 1.0
    if resize_factor > 1:
        img = cv2.resize(img, (round(width / resize_factor), round(height / resize_factor)))

    ok, buffer = cv2.imencode('.jpeg', img)  # encode as a JPEG byte stream
    if not ok:
        raise ValueError(f'cannot encode image as JPEG: {img_path}')
    return buffer.tobytes(), resize_factor
|
||||
|
||||
|
||||
|
||||
def resize_data(data, resize_factor:float):
    """Scale OCR geometry back up by ``resize_factor``.

    When ``resize_factor`` is greater than 1, ``'Width'`` / ``'Height'`` (if
    both are present) and every list entry of ``'coors'`` are multiplied by
    the factor and rounded. ``'coors'`` entries that are not lists are
    dropped. The mapping is modified in place.

    Args:
        data: OCR result mapping.
        resize_factor: Factor the image was divided by before upload.

    Returns:
        The same ``data`` object.
    """
    if not resize_factor > 1:
        return data

    width, height = data.get('Width'), data.get('Height')
    if width is not None and height is not None:
        data['Width'] = round(width * resize_factor)
        data['Height'] = round(height * resize_factor)

    boxes = data.get('coors')
    if boxes is not None and type(boxes) == list:
        scaled = []
        for box in boxes:
            if type(box) == list:
                scaled.append([round(value * resize_factor) for value in box])
        data['coors'] = scaled
    return data
|
||||
|
||||
def api_ocr_pro(img_path, void_value, auth_dict, config):
    """Upload one image to the OCR endpoint and return the parsed response.

    The request is a multipart POST of the image under the field ``img`` to
    ``config['server_lst'][config['server']] + '/' + config['ocr_type']``
    with the header ``Authorization: gjcool <token>``. When
    ``config['max_length']`` is non-zero the image is first passed through
    ``resize_image`` and the returned geometry is scaled back with
    ``resize_data``.

    Args:
        img_path: Path of the image to recognise.
        void_value: Value returned on any failure.
        auth_dict: Mapping providing ``'token'``.
        config: Mapping providing ``'timeout_connect'``, ``'timeout_read'``,
            ``'retry_time'``, ``'server'``, ``'server_lst'``, ``'ocr_type'``
            and ``'max_length'``.

    Returns:
        The response JSON (a dict) on success, otherwise ``void_value``. A
        response containing a non-null ``'msg'`` or ``'detail'`` is printed
        and treated as a failure.
    """
    try:
        access_token = auth_dict['token']
        connect_timeout = config['timeout_connect']
        read_timeout = config['timeout_read']
        retry_times = config['retry_time']
        server_type = config['server']
        ocr_type = config['ocr_type']

        max_length = config['max_length']

        url = config['server_lst'][server_type] + f'/{ocr_type}'
        headers = {'Authorization': f'gjcool {access_token}'}

        img_name = Path(img_path).name
        mime = get_mime(img_path)

        if max_length == 0:
            # Read the payload into memory instead of handing requests an
            # open file object: the handle was never closed, and after a
            # failed attempt it was left at EOF so retries sent no data.
            with open(img_path, 'rb') as f:
                img_bytes = f.read()
            resize_factor = 1.0
        else:
            # NOTE(review): resize_image always returns JPEG bytes while
            # `mime` still describes the original file -- confirm the server
            # ignores the declared content type.
            img_bytes, resize_factor = resize_image(img_path, max_length)
        files = [('img', (img_name, img_bytes, mime))]

        response = None
        for attempt in range(1, retry_times + 1):
            try:
                response = requests.post(url, headers=headers, data={}, files=files, timeout=(connect_timeout, read_timeout))
                break
            except requests.exceptions.RequestException as e:
                print(f'retry {attempt} times')
                print(e)

        # Still None when every attempt failed (or retry_times was 0).
        if response is None:
            return void_value

        result = response.json()
        if result.get('msg') is None and result.get('detail') is None:
            return resize_data(result, resize_factor)
        print(result)
        return void_value
    except Exception as e:
        # Best-effort by design: any error maps to void_value. Only Exception
        # is caught so that Ctrl+C / SystemExit still propagate.
        print('ocr_pro failed')
        print(e)
        return void_value
|
||||
|
||||
|
||||
def batch_ocr_api(path_lst, task_name, auth_dict, config): #layout, compact, area_num, row_num, , anno_open:bool=True
    """Run OCR over a list of images, saving results and logging statistics.

    For every entry the image is sent through ``api_ocr_pro``. On success the
    response is written as JSON to ``json_path`` and its text (the ``'text'``
    field, or the output of ``data_to_text`` when absent) to ``text_path``.
    Failed images are logged as warnings and appended to
    ``Fail_OCR_dir/<task_name>.txt``, which is emptied at the start of the run.

    Args:
        path_lst: List of dicts with ``'img_path'``, ``'json_path'`` and
            ``'text_path'``.
        task_name: Label used in the log and as the failure-list file name.
        auth_dict: Passed through to ``api_ocr_pro``.
        config: Passed through to ``api_ocr_pro``.
    """
    logging.info(f'\t\t任务:{task_name}\t\tSTART\t\t总数:{len(path_lst)}')
    logging.info(f'\t\t序号\t用时\t字数\t列数\t大小\t宽度\t高度\t路径')

    # Running totals for the summary line.
    total_info = {'TimeCost':0,'CharNumber':0, 'LineNumber':0, 'ImageSize':0, 'SuccessNumber':0, 'FailNumber':0}
    fail_list_path = str(Fail_OCR_dir.joinpath(f'{task_name}.txt'))
    save_text(fail_list_path, "", False)

    start_time = time.time()
    for index, path_dict in enumerate(tqdm(path_lst, desc="OCR")):
        now_api_time = time.time()
        data = api_ocr_pro(path_dict['img_path'], {}, auth_dict, config)
        last_api_time = time.time()
        time_cost = last_api_time - now_api_time  # seconds

        # An empty dict is the failure marker handed to api_ocr_pro above.
        if data=={}:
            logging.warning(f"\t{index+1:<5d}\tocr failed\t{path_dict['img_path']}")
            save_text(fail_list_path, f"{path_dict['img_path']}\n", True)
            total_info['FailNumber'] += 1
            continue

        try:
            with open(path_dict['json_path'], "w", encoding='utf-8') as json_file:
                json.dump(data, json_file, ensure_ascii=False)

            text = data.get('text', data_to_text(data))

            with open(path_dict['text_path'], "w", encoding='utf-8') as text_file:
                text_file.write(text)

            # Columns: index, time, chars, lines, size, width, height, path
            img_size = round(data['Size']/1024) #KB
            logging.info(f"\t\t{index+1:<6d}\t{time_cost:.2f}\t{data['CharNumber']:<6d}\t{data['LineNumber']:<6d}\t{img_size:<6d}\t{data['Width']:<6d}\t{data['Height']:<6d}\t{path_dict['img_path']}")

            total_info['TimeCost'] += time_cost
            total_info['CharNumber'] += data['CharNumber']
            total_info['LineNumber'] += data['LineNumber']
            total_info['ImageSize'] += data['Size']
            total_info['SuccessNumber'] += 1
        except:
            logging.warning(f"\t\t{index+1:<6d}\tsave data wrong\t{path_dict['img_path']}")
            save_text(fail_list_path, f"{path_dict['img_path']}\n", True)
            total_info['FailNumber'] += 1

    logging.info(f"\t\t任务:{task_name}\t\tEND")
    logging.info(f"\t\t总数\t总用时\t总字数\t总列数\t总大小")
    logging.info(f"\t\t{total_info['SuccessNumber']}/{total_info['FailNumber']}  \t{time.time()-start_time:.2f}\t{total_info['CharNumber']:<6d}\t{total_info['LineNumber']:<6d}\t{total_info['ImageSize']:<6d}\n")
|
||||
|
||||
|
||||
|
||||
def get_allfile_alldir_in_dir(path):
    """Recursively list every sub-directory and file below ``path``.

    Args:
        path: Root directory to walk.

    Returns:
        ``(dirs, files)``: two sorted lists of paths built by joining each
        name onto the directory ``os.walk`` reported it in.
    """
    found_dirs, found_files = [], []
    for root, subdir_names, file_names in os.walk(path):
        found_dirs.extend(os.path.join(root, name) for name in subdir_names)
        found_files.extend(os.path.join(root, name) for name in file_names)
    found_dirs.sort()
    found_files.sort()
    return found_dirs, found_files
|
||||
|
||||
|
||||
def get_token_by_login(apiid, password, url, timeout=15):
    """Log in and return the access token, or ``''`` on any failure.

    Args:
        apiid: Account identifier sent as the ``apiid`` form field.
        password: Sent as the ``password`` form field together with
            ``encrypt=1``.
        url: Login endpoint.
        timeout: Seconds passed to ``requests.post``. Without a timeout
            requests waits indefinitely on an unresponsive server.

    Returns:
        The ``access_token`` field of the JSON response, or an empty string
        if the request fails, the body is not JSON, or the field is missing.
    """
    try:
        payload = {'apiid':apiid, 'password':password, 'encrypt':1, 'is_long':1}
        response = requests.post(url, data=payload, timeout=timeout)
        return response.json()['access_token']
    except Exception:
        # Best-effort by design; Ctrl+C / SystemExit are no longer swallowed.
        return ''
|
||||
|
||||
|
||||
def get_mime(img_path):
    """Return an ``image/<subtype>`` content type for an image file.

    The subtype comes from ``whatimage.identify_image`` on the file content.
    When that yields nothing, the file extension is used instead, lower-cased
    and with common aliases mapped to their registered names, so ``.JPG``
    gives ``image/jpeg`` rather than ``image/JPG``.

    Args:
        img_path: Path of the image file.

    Returns:
        The content type string.
    """
    with open(img_path, 'rb') as f:
        img = f.read()

    mime_type = whatimage.identify_image(img)
    if mime_type is None or mime_type=='None':
        suffix = Path(img_path).suffix.replace('.', '').lower()
        # Extensions that differ from the registered media subtype.
        mime_type = {'jpg': 'jpeg', 'jpe': 'jpeg', 'tif': 'tiff'}.get(suffix, suffix)
    return f'image/{mime_type}'
|
||||
|
||||
|
||||
|
||||
def load_config(config_path):
    """Load a UTF-8 JSON file and return its content.

    Args:
        config_path: Path of the JSON file.

    Returns:
        The parsed JSON value, or ``None`` if the file cannot be read or
        parsed (a message and the reason are printed).
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        # Only Exception is caught so that Ctrl+C / SystemExit propagate.
        print('配置文件读取失败')
        print(f'{config_path}: {e}')
        return None
|
||||
|
||||
|
||||
def logging_init(log_type:str, dir:Path=Log_dir, level=logging.INFO):
    '''
    Initialise the root logger to write to ``<dir>/<log_type>/<YYYY-MM-DD>.log``.

    Args:
        log_type: Sub-folder name under ``dir``.
        dir: Base log directory.
        level: Level for the root logger.

    Like ``logging.basicConfig``, this does nothing when the root logger
    already has handlers.
    '''
    log_dir = dir / log_type
    log_dir.mkdir(parents=True, exist_ok=True)

    # basicConfig is a no-op once the root logger is configured; return early
    # so no unused file handle is opened in that case.
    if logging.getLogger().handlers:
        return

    log_filepath = log_dir / (time.strftime("%Y-%m-%d", time.localtime()) + '.log')
    # basicConfig(encoding=...) only exists from Python 3.9; the file header
    # also targets Python 3.8, where that keyword raises ValueError. An
    # explicit FileHandler gives the same UTF-8 append-mode log on both.
    file_handler = logging.FileHandler(str(log_filepath), encoding='utf-8')
    logging.basicConfig(
        handlers=[file_handler],
        level=level,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
|
||||
|
||||
|
||||
def prepare_ocr_dir_task_paths(dir, task_name, max_size):
    """Build the OCR work list for every file below a directory.

    The sub-directory layout of ``dir`` is mirrored under
    ``Json_Data_dir/<task_name>`` and ``Text_Data_dir/<task_name>``.

    Args:
        dir: Root directory to scan.
        task_name: Name of the output sub-folder.
        max_size: Size limit in bytes; files of this size or larger are
            reported and skipped.

    Returns:
        ``(path_lst, fail_lst)``: ``path_lst`` holds one dict per accepted
        file with ``'img_path'``, ``'json_path'`` and ``'text_path'``;
        ``fail_lst`` holds the paths of oversized files.
    """
    # Create the output folders.
    json_save_dir = Json_Data_dir.joinpath(task_name)
    json_save_dir.mkdir(parents=True, exist_ok=True)
    text_save_dir = Text_Data_dir.joinpath(task_name)
    text_save_dir.mkdir(parents=True, exist_ok=True)

    alldir_path, allfile_path = get_allfile_alldir_in_dir(dir)

    # Map source paths to output paths through their path relative to `dir`.
    # The earlier str.replace(dir, ...) rewrote every occurrence of the root
    # string, so a path that happened to contain it again was mapped wrongly.
    for dir_path in alldir_path:
        rel_dir = os.path.relpath(dir_path, dir)
        json_save_dir.joinpath(rel_dir).mkdir(parents=True, exist_ok=True)
        text_save_dir.joinpath(rel_dir).mkdir(parents=True, exist_ok=True)

    path_lst, fail_lst = [], []
    for file_path in allfile_path:
        file_size = os.path.getsize(file_path)
        if file_size < max_size:
            filename = Path(file_path).stem
            rel_parent = os.path.dirname(os.path.relpath(file_path, dir))
            path_lst.append({
                'img_path': file_path,
                'json_path': str(json_save_dir.joinpath(rel_parent, f'{filename}.json')),
                'text_path': str(text_save_dir.joinpath(rel_parent, f'{filename}.txt')),
            })
        else:
            print(f'{file_path}体积过大, {file_size/1024/1024}MB, 超过最大限量{max_size/1024/1024}MB')
            fail_lst.append(file_path)
    return path_lst, fail_lst
|
||||
|
||||
def prepare_ocr_files_task_paths(paths, task_name, max_size):
    """Build the OCR work list for an explicit collection of files.

    All outputs go directly into ``Json_Data_dir/<task_name>`` and
    ``Text_Data_dir/<task_name>``, named after each file's stem.

    Args:
        paths: Iterable of image file paths.
        task_name: Name of the output sub-folder.
        max_size: Size limit in bytes; files of this size or larger are
            reported and skipped.

    Returns:
        ``(path_lst, fail_lst)``: dicts with ``'img_path'``, ``'json_path'``
        and ``'text_path'`` for accepted files, and the oversized paths.
    """
    json_save_dir = Json_Data_dir / task_name
    json_save_dir.mkdir(exist_ok=True)
    text_save_dir = Text_Data_dir / task_name
    text_save_dir.mkdir(exist_ok=True)

    path_lst, fail_lst = [], []
    for file_path in paths:
        if os.path.getsize(file_path) >= max_size:
            print(f'{file_path}体积过大, {os.path.getsize(file_path)/1024/1024}MB, 超过最大限量{max_size/1024/1024}MB')
            fail_lst.append(file_path)
            continue
        stem = Path(file_path).stem
        path_lst.append({
            'img_path': file_path,
            'json_path': str(json_save_dir / f'{stem}.json'),
            'text_path': str(text_save_dir / f'{stem}.txt'),
        })
    return path_lst, fail_lst
|
||||
|
||||
|
||||
def prepare_ocr_list_task_paths(list_paths, task_name, max_size):
    """Build the OCR work list from text files that list one image path per line.

    Blank lines and entries that are not existing regular files are skipped.
    The collected paths are handed to ``prepare_ocr_files_task_paths``.

    Args:
        list_paths: Iterable of list-file paths (UTF-8, with or without BOM).
        task_name: Name of the output sub-folder.
        max_size: Size limit in bytes, forwarded unchanged.

    Returns:
        The ``(path_lst, fail_lst)`` pair from ``prepare_ocr_files_task_paths``.
    """
    img_paths = []
    for lst_path in list_paths:
        # utf-8-sig also reads plain UTF-8 and strips a leading BOM, which
        # would otherwise be glued to the first path.
        with open(lst_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                img_path = line.strip()
                # Path('') is '.', which exists -- an empty line used to be
                # queued as an image and crashed os.path.getsize('') later.
                if img_path and Path(img_path).is_file():
                    img_paths.append(img_path)

    return prepare_ocr_files_task_paths(img_paths, task_name, max_size)
|
||||
|
||||
def read_paths(pathtype='file', init_dir='./'):
    """Ask the user to pick files or a directory through a Tk dialog.

    Args:
        pathtype: ``'file'`` opens a multi-file picker, ``'dir'`` a directory
            picker.
        init_dir: Directory the dialog starts in.

    Returns:
        The value returned by the dialog (``askopenfilenames`` or
        ``askdirectory``), or ``None`` for any other ``pathtype``.
    """
    root = Tk()
    try:
        root.focus_force()
        # Hide the empty root window shortly after the dialog takes over.
        root.after(10, root.withdraw)
        if pathtype == 'file':
            return filedialog.askopenfilenames(parent=root, initialdir=init_dir)
        elif pathtype == 'dir':
            return filedialog.askdirectory(parent=root, initialdir=init_dir)
        return None
    finally:
        # A new root is created on every call; destroy it so hidden Tk
        # instances do not pile up across repeated selections.
        root.destroy()
|
||||
|
||||
def save_text(filepath, content, is_add=False):
    """Write ``content`` to ``filepath`` as UTF-8 text.

    Args:
        filepath: Target file; a falsy value makes the call a no-op.
        content: Text to write.
        is_add: Append when true, otherwise overwrite.
    """
    if not filepath:
        return
    mode = "a" if is_add else "w"
    with open(filepath, mode, encoding='utf-8') as handle:
        handle.write(content)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Interactive loop: choose an input mode, name the task, pick the images,
    # run the batch, then ask again. Any other key leaves the loop.
    logging_init('OCR')
    sub_key = input('选择图片: 1.目录; 2.文件; 3.列表. 输入其他键, 返回上层\t')
    while sub_key in ['1', '2', '3']:
        task_name = input(f'请输入任务名称. 默认取当前日期时间({TimeStampStr}):\t')
        if not task_name:
            task_name = get_timestamp(TimeStampStr)

        auth_dict = load_config(AuthFile)
        config = load_config(ConfigFile)
        # load_config returns None on failure (and has already printed a
        # message); stop instead of crashing on config['max_size'].
        if auth_dict is None or config is None:
            break
        max_size = config['max_size'] * 1024 * 1024

        # Build the image lists: path_lst, fail_lst
        if sub_key in ['1']:
            print('请选择图片目录')
            img_dir = read_paths(pathtype='dir', init_dir=str(Start_dir))
            if not img_dir:
                break
            print(img_dir)
            path_lst, fail_lst = prepare_ocr_dir_task_paths(img_dir, task_name, max_size)
        elif sub_key in ['2']:
            print('请选择图片文件')
            img_paths = read_paths(init_dir=str(Start_dir))
            if not img_paths:
                break
            print(f'已选择{len(img_paths)}个文件')
            path_lst, fail_lst = prepare_ocr_files_task_paths(img_paths, task_name, max_size)
        elif sub_key in ['3']:
            print('请选择列表文件')
            list_paths = read_paths(init_dir=str(Start_dir))
            if not list_paths:
                break
            print(f'已选择{len(list_paths)}个列表')
            path_lst, fail_lst = prepare_ocr_list_task_paths(list_paths, task_name, max_size)

        # Oversized files were found: let the user decide whether to go on.
        if len(fail_lst)>0:
            check_size = input(f'有{len(fail_lst)}个文件体积超标, 是否停止任务: 1. 继续; 其他, 中止\t')
            if check_size not in ['1']:
                break

        try:
            batch_ocr_api(path_lst, task_name, auth_dict, config)
        except Exception as e:
            # Keep the loop alive after a failed task, but show the reason.
            print(f'{task_name}任务失败')
            print(e)

        sub_key = input('选择方式: 1.目录; 2.文件; 3.列表. 输入其他键, 返回上层\t')
|
||||
|
||||
|
3
password_encrypt.txt
Normal file
3
password_encrypt.txt
Normal file
@ -0,0 +1,3 @@
|
||||
c4tW323xNjzh+hf6cR0KddW25iUhY3JpYcBXQF1PJ73zvgtqIB/R7pRlHoDTY5aFDv7N8L0EfqyM
|
||||
Eskb49WGiI3kAWhyo+sfUZ7ZgwC7cXWdma3b3oenL+RK/3ZWOKOP/Uxjunm8ZTGB4huHHFlkZXJW
|
||||
HXRZLJm3nJ6oxdLr/ck=
|
5
password_pubkey.pem
Normal file
5
password_pubkey.pem
Normal file
@ -0,0 +1,5 @@
|
||||
-----BEGIN RSA PUBLIC KEY-----
|
||||
MIGJAoGBAIhu/jI4yZqa4+Yyh7qN52YH4Y0xohg5no4+w1N15y0oSeNuCE2V88eR
|
||||
oXg+EwmY6RsHmTvHE/OnvM44sqfEFVucSwJDj0cfUTlQqkKzf+YukKfaQx6syiBn
|
||||
2LzXH4LOP+dLho4rzD32HwM5rPtQ7nWtdUr7PrbUyjGeBsLI+J2LAgMBAAE=
|
||||
-----END RSA PUBLIC KEY-----
|
8
requirements.txt
Normal file
8
requirements.txt
Normal file
@ -0,0 +1,8 @@
|
||||
requests
|
||||
whatimage
|
||||
rsa
|
||||
tqdm
|
||||
chardet
|
||||
PyPDF2[full]
|
||||
pdf2image
|
||||
pillow
|
BIN
图/0003.JPG
Normal file
BIN
图/0003.JPG
Normal file
Binary file not shown.
After Width: | Height: | Size: 4.5 MiB |
BIN
图/ocrfailed/110613020250003/0004/0024A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0004/0024A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0004A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0004A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0012A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0012A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0013A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0013A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0024A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0024A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0027A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0027A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0032A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0032A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0039A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0039A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0047A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0047A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0049A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0049A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0006/0054A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0006/0054A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0008/0027A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0008/0027A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0012/0005A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0012/0005A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0012/0019A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0012/0019A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0012/0022A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0012/0022A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0013/0004A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0013/0004A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0013/0021A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0013/0021A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0013/0031A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0013/0031A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0013/0057A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0013/0057A.jp2
Normal file
Binary file not shown.
BIN
图/ocrfailed/110613020250003/0013/0062A.jp2
Normal file
BIN
图/ocrfailed/110613020250003/0013/0062A.jp2
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user