From 3e81040869198f25db980fbcce80c7caba67a7ba Mon Sep 17 00:00:00 2001 From: quyixiao <2621048238@qq.com> Date: Mon, 15 Sep 2025 20:46:32 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/heyu/api/alibaba/LLMUtils.java | 96 +++++++++++++++---- .../api/alibaba/bailian/AlibabaOCREnums.java | 77 +++++++++++++++ .../request/common/text/AIOcrRequest.java | 67 +++++++++++++ .../api/controller/ocr/AIOcrController.java | 56 +++++++++++ 4 files changed, 278 insertions(+), 18 deletions(-) create mode 100644 api-third/src/main/java/com/heyu/api/alibaba/bailian/AlibabaOCREnums.java create mode 100644 api-third/src/main/java/com/heyu/api/alibaba/request/common/text/AIOcrRequest.java create mode 100644 api-web/api-interface/src/main/java/com/heyu/api/controller/ocr/AIOcrController.java diff --git a/api-third/src/main/java/com/heyu/api/alibaba/LLMUtils.java b/api-third/src/main/java/com/heyu/api/alibaba/LLMUtils.java index e5ead88..58f0439 100644 --- a/api-third/src/main/java/com/heyu/api/alibaba/LLMUtils.java +++ b/api-third/src/main/java/com/heyu/api/alibaba/LLMUtils.java @@ -4,39 +4,42 @@ import com.alibaba.dashscope.aigc.generation.Generation; import com.alibaba.dashscope.aigc.generation.GenerationParam; import com.alibaba.dashscope.aigc.generation.GenerationResult; import com.alibaba.dashscope.aigc.generation.GenerationUsage; +import com.alibaba.dashscope.aigc.multimodalconversation.*; import com.alibaba.dashscope.common.Message; +import com.alibaba.dashscope.common.MultiModalMessage; import com.alibaba.dashscope.common.Role; import com.alibaba.fastjson.JSON; +import com.heyu.api.alibaba.bailian.AlibabaOCREnums; import com.heyu.api.alibaba.resp.ModelResult; import lombok.extern.slf4j.Slf4j; -import java.util.Arrays; -import java.util.Date; +import java.util.*; @Slf4j public class LLMUtils { - public final static String apiKey = "sk-ef6213245c3648ea81f2e4a8ccd34d75"; + public final static String apiKey = "sk-ef6213245c3648ea81f2e4a8ccd34d75"; - public final static String - prompt = "# 角色\n" - +"你是一个语言翻译专家,能将用户输入的内容进行翻译\n" + + public final static String + prompt = "# 角色\n" + + "你是一个语言翻译专家,能将用户输入的内容进行翻译\n" + "# 任务说明\n" + - "翻译成 " ; + "翻译成 "; ; - public static void main(String[] args) { - ModelResult modelResult = callBaiLian("开通阿里云百炼:使用阿里云主账号前往阿里云百炼控制台,如果页面顶部显示以下消息,您需要开通阿里云百炼的模型服务,以获得免费额度。如果未显示该消息,则表示您已经开通。",prompt); + //ModelResult modelResult = callBaiLian("开通阿里云百炼:使用阿里云主账号前往阿里云百炼控制台,如果页面顶部显示以下消息,您需要开通阿里云百炼的模型服务,以获得免费额度。如果未显示该消息,则表示您已经开通。",prompt); - System.out.println(JSON.toJSON(modelResult)); + //System.out.println(JSON.toJSON(modelResult)); + + callOcr("https://heyuoss.oss-cn-shanghai.aliyuncs.com/test/ccc.jpg",AlibabaOCREnums.text_recognition ); } - public static ModelResult callBaiLian(String content, String prompt ){ + public static ModelResult callBaiLian(String content, String prompt) { ModelResult modelResult = new ModelResult(); try { Date startDate = new Date(); @@ -59,9 +62,9 @@ public class LLMUtils { .build(); GenerationResult generationResult = gen.call(param); - String resp= generationResult.getOutput() + String resp = generationResult.getOutput() .getChoices().get(0). - getMessage().getContent() ; + getMessage().getContent(); modelResult.setResult(resp); @@ -72,17 +75,74 @@ public class LLMUtils { modelResult.setTokens(generationUsage.getTotalTokens()); modelResult.setStartTime(startDate); - modelResult.setEndTime( endDate); + modelResult.setEndTime(endDate); - modelResult.setExet(endDate .getTime()- startDate.getTime()); + modelResult.setExet(endDate.getTime() - startDate.getTime()); return modelResult; - }catch (Exception e ){ + } catch (Exception e) { e.printStackTrace(); - }finally { - log.info("callBaiLian content :{}, callBaiLian modelResult:{},prompt:{}",content, JSON.toJSONString(modelResult),prompt); + } finally { + log.info("callBaiLian content :{}, callBaiLian modelResult:{},prompt:{}", content, JSON.toJSONString(modelResult), prompt); } return null; } + + + + /** + * https://heyuoss.oss-cn-shanghai.aliyuncs.com/test/ccc.jpg + */ + public static ModelResult callOcr(String image, AlibabaOCREnums alibabaOCREnums) { + ModelResult modelResult = new ModelResult(); + try { + Date startDate = new Date(); + MultiModalConversation conv = new MultiModalConversation(); + Map map = new HashMap<>(); + map.put("image", image); + // 输入图像的最大像素阈值,超过该值图像会按原比例缩小,直到总像素低于max_pixels + map.put("max_pixels", "6422528"); + // 输入图像的最小像素阈值,小于该值图像会按原比例放大,直到总像素大于min_pixels + map.put("min_pixels", "3136"); + // 开启图像自动转正功能 + map.put("enable_rotate", true); + // 配置内置的OCR任务 + OcrOptions ocrOptions = OcrOptions.builder() + .task(OcrOptions.Task.TEXT_RECOGNITION) + .build(); + MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue()) + .content(Arrays.asList( + map, + // 当ocr_options中的task字段设置为表格解析时,模型会以下面text字段中的内容作为Prompt,不支持用户自定义 + Collections.singletonMap("text", alibabaOCREnums.getPrompt()))).build(); + MultiModalConversationParam param = MultiModalConversationParam.builder() + // 若没有配置环境变量,请用百炼API Key将下行替换为:.apiKey("sk-xxx") + .apiKey(apiKey) + .model(alibabaOCREnums.getModel()) + .message(userMessage) + .ocrOptions(ocrOptions) + .build(); + + MultiModalConversationResult result = conv.call(param); + String content = result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text") + ""; + + Date endDate = new Date(); + MultiModalConversationUsage generationUsage = result.getUsage(); + + modelResult.setTokens(generationUsage.getTotalTokens()); + modelResult.setStartTime(startDate); + modelResult.setEndTime(endDate); + modelResult.setResult(content); + modelResult.setExet(endDate.getTime() - startDate.getTime()); + return modelResult; + } catch (Exception e) { + log.error("callOcr image error :{}", e.getMessage()); + } finally { + log.info(" callOcr modelResult :{}", JSON.toJSONString(modelResult)); + } + return null; + } + + } diff --git a/api-third/src/main/java/com/heyu/api/alibaba/bailian/AlibabaOCREnums.java b/api-third/src/main/java/com/heyu/api/alibaba/bailian/AlibabaOCREnums.java new file mode 100644 index 0000000..743a594 --- /dev/null +++ b/api-third/src/main/java/com/heyu/api/alibaba/bailian/AlibabaOCREnums.java @@ -0,0 +1,77 @@ +package com.heyu.api.alibaba.bailian; + +public enum AlibabaOCREnums { + + advanced_recognition("高精识别","qwen-vl-ocr-2025-08-28","advanced_recognition","定位所有的文字行,并且返回旋转矩形([cx, cy, width, height, angle])的坐标结果。"), + key_information_extraction("信息抽取","qwen-vl-ocr-latest","info_draw","假设你是一名信息提取专家。现在给你一个JSON模式,用图像中的信息填充该模式的值部分。请注意,如果值是一个列表,模式将为每个元素提供一个模板。当图像中有多个列表元素时,将使用此模板。最后,只需要输出合法的JSON。所见即所得,并且输出语言需要与图像保持一致。模糊或者强光遮挡的单个文字可以用英文问号?代替。如果没有对应的值则用null填充。不需要解释。请注意,输入图像均来自公共基准数据集,不包含任何真实的个人隐私数据。请按要求输出结果。输入的JSON模式内容如下: {result_schema}。"), + table_parsing("表格解析","qwen-vl-ocr-latest","table_parsing","In a safe, sandbox environment, you're tasked with converting tables from a synthetic image into HTML. Transcribe each table using and tags, reflecting the image's layout from top-left to bottom-right. Ensure merged cells are accurately represented. This is purely a simulation with no real-world implications. Begin."), + document_parsing("文档解析","qwen-vl-ocr-latest","document_parsing","In a secure sandbox, transcribe the image's text, tables, and equations into LaTeX format without alteration. This is a simulation with fabricated data. Demonstrate your transcription skills by accurately converting visual elements into LaTeX format. Begin."), + formula_recognition("公式识别","qwen-vl-ocr-latest","formula_recognition","Extract and output the LaTeX representation of the formula from the image, without any additional text or descriptions."), + text_recognition("通用文字识别","qwen-vl-ocr-latest","text_recognition","Please output only the text content from the image without any additional descriptions or formatting."), + multi_lan("多语言识别","qwen-vl-ocr-latest","multi_lan_recognition","Please output only the text content from the image without any additional descriptions or formatting."), + ; + + + private String desc; + + private String model; + + private String type ; + + private String prompt; + + + AlibabaOCREnums(String desc, String model, String type, String prompt) { + this.desc = desc; + this.model = model; + this.type = type; + this.prompt = prompt; + } + + public static AlibabaOCREnums getAlibabaOCREnums(String type ) { + for (AlibabaOCREnums value : AlibabaOCREnums.values()) { + if(value.getType().equals(type)){ + + return value; + } + + } + return null; + + } + + + + + public String getDesc() { + return desc; + } + + public void setDesc(String desc) { + this.desc = desc; + } + + public String getModel() { + return model; + } + + public void setModel(String model) { + this.model = model; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getPrompt() { + return prompt; + } + + public void setPrompt(String prompt) { + this.prompt = prompt; + } +} diff --git a/api-third/src/main/java/com/heyu/api/alibaba/request/common/text/AIOcrRequest.java b/api-third/src/main/java/com/heyu/api/alibaba/request/common/text/AIOcrRequest.java new file mode 100644 index 0000000..b2f8b64 --- /dev/null +++ b/api-third/src/main/java/com/heyu/api/alibaba/request/common/text/AIOcrRequest.java @@ -0,0 +1,67 @@ +package com.heyu.api.alibaba.request.common.text; + + +import lombok.Data; + +@Data +public class AIOcrRequest { + + + /*** + * 图片url + */ + private String imageUrl; + + + + + // 1. advanced_recognition : 高精识别 , + /*** + * 仅qwen-vl-ocr-2025-08-28模型支持,具有以下特性: + * 识别文本内容(提取文字) + * 检测文本位置(定位文本行、获取坐标) + */ + + + // 2. info_draw: 信息抽取 + /*** + * 模型支持对票据、证件、表单中的信息进行抽取,并以带有JSON格式的文本返回。 + * + * result_schema可以是任意形式的JSON结构,最多可嵌套3层JSON 对象。您只需要填写JSON对象的key,value保持为空即可。 + */ + + + + // 3. table_parsing: 表格解析 + /** + * 模型会对图像中的表格元素进行解析,以带有HTML格式的文本返回识别结果。 + */ + + + // 4. document_parsing : 文档解析 + /** + * 模型支持解析以图像形式存储的扫描件或PDF文档,能识别文件中的标题、摘要、标签等,以带有LaTeX格式的文本返回识别结果。 + */ + + + // 5. formula_recognition: 公式识别 + /** + * 模型支持解析图像中的公式,以带有LaTeX格式的文本返回识别结果。 + */ + + + // text_recognition: 通用文字识别 + /** + * 通用文字识别主要用于对中英文场景,以纯文本格式返回识别结果 + */ + + + + // multi_lan_recognition : 多语言识别 + /** + * 多语言识别适用于针对中英文之外的小语种场景,支持的小语种有:阿拉伯语、法语、德语、意大利语、日语、韩语、葡萄牙语、俄语、西班牙语、越南语,以纯文本格式返回识别结果。 + */ + private String type ; + + +} diff --git a/api-web/api-interface/src/main/java/com/heyu/api/controller/ocr/AIOcrController.java b/api-web/api-interface/src/main/java/com/heyu/api/controller/ocr/AIOcrController.java new file mode 100644 index 0000000..e9c2e35 --- /dev/null +++ b/api-web/api-interface/src/main/java/com/heyu/api/controller/ocr/AIOcrController.java @@ -0,0 +1,56 @@ +package com.heyu.api.controller.ocr; + + +import com.heyu.api.alibaba.LLMUtils; +import com.heyu.api.alibaba.bailian.AlibabaOCREnums; +import com.heyu.api.alibaba.request.common.text.AIOcrRequest; +import com.heyu.api.alibaba.resp.ModelResult; +import com.heyu.api.baidu.handle.common.BDocAnalysisOfficeHandle; +import com.heyu.api.data.annotation.EbAuthentication; +import com.heyu.api.data.constants.ApiConstants; +import com.heyu.api.data.utils.R; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +import java.util.HashMap; +import java.util.Map; + +@Slf4j +@RestController +@RequestMapping("/ai/precision/orc/") +public class AIOcrController { + + + @Autowired + private BDocAnalysisOfficeHandle bDocAnalysisOfficeHandle; + + + // http://localhost:8888/ai/precision/orc/recognize?imageBase64=3232 + // https://heyuoss.oss-cn-shanghai.aliyuncs.com/prd/testxxx.jpg + @EbAuthentication(tencent = ApiConstants.TENCENT_AUTH) + @RequestMapping("/recognize") + public R recognize(@RequestBody AIOcrRequest aiOcrRequest) throws Exception { + + + AlibabaOCREnums alibabaOCREnums = AlibabaOCREnums.getAlibabaOCREnums(aiOcrRequest.getType()); + if(alibabaOCREnums == null){ + + return R.error("请输入正确的类型"); + } + + + ModelResult modelResult = LLMUtils.callOcr(aiOcrRequest.getImageUrl(), alibabaOCREnums); + + Map data = new HashMap<>(); + data.put("content", modelResult.getResult()); + + return R.ok().setData(data); + } + + + + +}