فهرست منبع

处理上传时图片文件OCR提取文字存入es库。

wukai 2 سال پیش
والد
کامیت
cb5108d410

+ 2 - 0
doc-admin/src/main/java/com/doc/RuoYiApplication.java

@@ -3,6 +3,7 @@ package com.doc;
 import org.springframework.boot.SpringApplication;
 import org.springframework.boot.autoconfigure.SpringBootApplication;
 import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
+import org.springframework.scheduling.annotation.EnableAsync;
 
 /**
  * 启动程序
@@ -10,6 +11,7 @@ import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
  * @author ruoyi
  */
 @SpringBootApplication(exclude = {DataSourceAutoConfiguration.class})
+@EnableAsync
 public class RuoYiApplication {
     public static void main(String[] args) {
         // System.setProperty("spring.devtools.restart.enabled", "false");

+ 94 - 0
doc-admin/src/test/java/com/test/SumJavaCode.java

@@ -0,0 +1,94 @@
+package com.test;
+
+import java.io.*;
+
+/**
+ * Small command-line utility that walks a source tree and counts code lines,
+ * comment lines and blank lines of every .java, .xml, .vue, .js and .html file.
+ *
+ * <p>The directory to scan may be passed as the first program argument; when it
+ * is omitted the historical hard-coded default is used.
+ */
+public class SumJavaCode {
+
+    /** Directory scanned when no argument is given on the command line. */
+    private static final String DEFAULT_ROOT = "D:\\code\\2023\\doc-ui\\src";
+
+    /** File-name suffixes (case-sensitive) that take part in the statistics. */
+    private static final String[] SOURCE_SUFFIXES = {".java", ".xml", ".vue", ".js", ".html"};
+
+    static long normalLines = 0; // code lines
+    static long commentLines = 0; // comment lines
+    static long whiteLines = 0; // blank lines
+
+    /**
+     * Scans the given directory (or the default one) and prints the totals.
+     *
+     * @param args optional; {@code args[0]} is the root directory to scan
+     */
+    public static void main(String[] args) {
+
+        SumJavaCode sjc = new SumJavaCode();
+        String root = (args != null && args.length > 0) ? args[0] : DEFAULT_ROOT;
+        File f = new File(root);
+        System.out.println(f.getName());
+        sjc.treeFile(f);
+        System.out.println("空行:" + whiteLines);
+        System.out.println("注释行:" + commentLines);
+        System.out.println("代码行:" + normalLines);
+    }
+
+    /**
+     * Recursively visits a directory and counts every matching source file.
+     *
+     * @param f directory to search
+     */
+    private void treeFile(File f) {
+        File[] children = f.listFiles();
+        if (children == null) {
+            // listFiles() returns null when f is not a directory or cannot be read.
+            return;
+        }
+        for (File child : children) {
+            if (child.isDirectory()) {
+                treeFile(child);
+            } else if (isSourceFile(child.getName())) {
+                System.out.println(child.getName());
+                sumCode(child);
+            }
+        }
+    }
+
+    /**
+     * Tells whether a file name ends with one of the counted suffixes.
+     *
+     * @param name simple file name
+     * @return {@code true} when the file should be counted
+     */
+    private static boolean isSourceFile(String name) {
+        for (String suffix : SOURCE_SUFFIXES) {
+            if (name.endsWith(suffix)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Adds the code, comment and blank line counts of one file to the totals.
+     *
+     * @param file source file to analyse
+     */
+    private void sumCode(File file) {
+        boolean comment = false;
+        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
+            String line;
+            while ((line = br.readLine()) != null) {
+                line = line.trim();
+                if (line.isEmpty()) {
+                    whiteLines++;
+                } else if (comment) {
+                    // Inside a block comment: every line counts until the closing marker.
+                    commentLines++;
+                    if (line.endsWith("*/")) {
+                        comment = false;
+                    }
+                } else if (line.startsWith("/*")) {
+                    commentLines++;
+                    // A one-line "/* ... */" closes immediately; the length check keeps
+                    // the degenerate "/*/" from being read as both opener and closer.
+                    boolean closedOnSameLine = line.length() >= 4 && line.endsWith("*/");
+                    comment = !closedOnSameLine;
+                } else if (line.startsWith("//")) {
+                    commentLines++;
+                } else {
+                    normalLines++;
+                }
+            }
+        } catch (IOException e) {
+            // Keep scanning the remaining files; just report the one that failed.
+            System.err.println("Failed to read " + file + ": " + e);
+        }
+    }
+}

+ 8 - 41
doc-biz/src/main/java/com/doc/biz/controller/ApiController.java

@@ -1,15 +1,13 @@
 package com.doc.biz.controller;
 
-import cn.hutool.http.HttpUtil;
 import com.doc.biz.domain.DocInfo;
 import com.doc.biz.domain.DocRecent;
 import com.doc.biz.service.IDocInfoService;
 import com.doc.biz.service.IDocRecentService;
 import com.doc.biz.service.IMongoService;
-import com.doc.biz.vo.DocumentVO;
+import com.doc.biz.service.IOcrService;
 import com.doc.common.core.controller.BaseController;
 import com.doc.common.core.domain.AjaxResult;
-import com.doc.system.service.ISysConfigService;
 import io.swagger.annotations.Api;
 import io.swagger.annotations.ApiOperation;
 import lombok.extern.slf4j.Slf4j;
@@ -18,10 +16,6 @@ import org.springframework.web.bind.annotation.*;
 import org.springframework.web.multipart.MultipartFile;
 
 import javax.annotation.Resource;
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.util.HashMap;
 
 /**
  * 文件上传
@@ -41,11 +35,11 @@ public class ApiController extends BaseController {
     @Resource
     private IMongoService mongoService;
     @Resource
-    private ISysConfigService configService;
-    @Resource
     private IDocRecentService recentService;
     @Resource
     private IDocInfoService docInfoService;
+    @Resource
+    private IOcrService ocrService;
 
     /**
      * 文件预览
@@ -82,18 +76,14 @@ public class ApiController extends BaseController {
      * 文字识别
      *
      * @param fileId fileId
-     * @return
+     * @return ajax
      */
     @ApiOperation("文字识别-通过fileId")
     @GetMapping("/ocr/{fileId}")
     public AjaxResult ocr(@PathVariable(name = "fileId") String fileId) {
-        DocumentVO vo = mongoService.downloadFile(fileId);
         try {
-            File upFile = File.createTempFile(vo.getFileName(), vo.getSuffix() + ".");
-            Files.write(upFile.toPath(), vo.getData());
-            String result = callPythonOcrApi(upFile);
-            return success(result);
-        } catch (IOException e) {
+            return success(ocrService.recognition(fileId));
+        } catch (Exception e) {
             return error(e.getMessage());
         }
     }
@@ -103,39 +93,16 @@ public class ApiController extends BaseController {
      * 文字识别
      *
      * @param file 文件
-     * @return
+     * @return ajax
      */
     @ApiOperation("文字识别-上传图片")
     @PostMapping("/upload")
     public AjaxResult ocr(@RequestParam(value = "file") MultipartFile file) {
         try {
-            //选择用缓冲区来实现这个转换即使用java 创建的临时文件 使用 MultipartFile.transferto()方法 。
-            String originalFilename = file.getOriginalFilename();
-            String[] filename = originalFilename.split("\\.");
-            File upFile = File.createTempFile(filename[0], filename[1] + ".");
-            file.transferTo(upFile);
-            String result = callPythonOcrApi(upFile);
-            return success(result);
+            return success(ocrService.recognition(file));
         } catch (Exception e) {
             return error(e.getMessage());
         }
     }
 
-    /**
-     * 调用python接口获取结果
-     *
-     * @param file 文件
-     * @return 识别结果
-     */
-    private String callPythonOcrApi(File file) {
-        HashMap<String, Object> map = new HashMap<>(3);
-        map.put("file", file);
-        String uri = configService.selectConfigByKey("api.ocr.uri");
-        String result = HttpUtil.post(uri, map);
-        try {
-            Files.delete(file.toPath());
-        } catch (IOException e) {
-        }
-        return result;
-    }
 }

+ 46 - 23
doc-biz/src/main/java/com/doc/biz/controller/OnlyOfficeController.java

@@ -124,7 +124,7 @@ public class OnlyOfficeController {
                     HttpEntity entity = res.getEntity();
                     try (InputStream is = entity.getContent()) {
                         multipartFile = FileUtils.getMultipartFile(is, info.getFileName());
-                        process(id, info, user, multipartFile);
+                        process(info, user, multipartFile);
                     } catch (Exception e) {
                         throw e;
                     }
@@ -132,7 +132,7 @@ public class OnlyOfficeController {
                     URL url = new URL(downloadUri);
                     HttpURLConnection connection = (java.net.HttpURLConnection) url.openConnection();
                     multipartFile = FileUtils.getMultipartFile(connection.getInputStream(), info.getFileName());
-                    process(id, info, user, multipartFile);
+                    process(info, user, multipartFile);
                     connection.disconnect();
                 }
 
@@ -141,8 +141,51 @@ public class OnlyOfficeController {
                 saved = 1;
             }
         }
-        /*插入操作日志--start*/
+
         String result = "{\"error\":" + saved + "}";
+        //操作日志
+        operLog(body, result, start);
+        writer.write(result);
+    }
+
+    /**
+     * Archives the current file of a document as a history version and then
+     * stores the newly edited file as the document's current content.
+     *
+     * <p>Any failure is logged and swallowed, so this method never throws.
+     * NOTE(review): because nothing is propagated, the caller cannot tell that
+     * saving failed and will presumably still answer OnlyOffice with success -
+     * confirm that this is intended.
+     *
+     * @param info          document record; its file id, size and audit fields are updated in place
+     * @param user          user on whose behalf the new version is saved
+     * @param multipartFile edited file content to upload
+     */
+    private void process(DocInfo info, SysUser user, MultipartFile multipartFile) {
+        try {
+            // Snapshot the file that is about to be replaced as a history version.
+            DocVersion version = new DocVersion();
+            version.setDocId(info.getDocId());
+            version.setFileId(info.getFileId());
+            version.setCreateBy(info.getCreateBy());
+            version.setCreateTime(info.getCreateTime());
+            versionService.insertDocVersion(version);
+
+            // Upload the new content and point the document record at it.
+            DocumentVO vo = mongoService.uploadFile(multipartFile);
+            info.setFileId(vo.getFileId());
+            info.setFileSize(vo.getFileSize());
+            info.setCreateBy(user.getUserName());
+            info.setCreateTime(new Date());
+            info.setUpdateBy(user.getUserName());
+            docInfoService.updateDocInfoByOnlyOffice(info);
+        } catch (Exception e) {
+            // Pass the throwable as the last argument so the stack trace is kept;
+            // the message alone is null for many runtime exceptions.
+            log.error("onlyoffice回调出错啦:{}", e.getMessage(), e);
+        }
+    }
+
+    /**
+     * 插入操作日志
+     *
+     * @param body   请求参数
+     * @param result 返回结果
+     * @param start  开始处理时间
+     */
+    private void operLog(String body, String result, long start) {
         //设置操作日志
         SysOperLog log = new SysOperLog();
         // 设置action动作
@@ -162,25 +205,5 @@ public class OnlyOfficeController {
         log.setCostTime(System.currentTimeMillis() - start);
         operLogService.insertOperlog(log);
         /*插入操作日志--end*/
-        writer.write(result);
-    }
-
-    private void process(Long id, DocInfo info, SysUser user, MultipartFile multipartFile) throws Exception {
-        //保存版本信息
-        DocVersion version = new DocVersion();
-        version.setDocId(id);
-        version.setFileId(info.getFileId());
-        version.setCreateBy(info.getCreateBy());
-        version.setCreateTime(info.getCreateTime());
-        versionService.insertDocVersion(version);
-
-        //保存新的文件信息
-        DocumentVO vo = mongoService.uploadFile(multipartFile);
-        info.setFileId(vo.getFileId());
-        info.setFileSize(vo.getFileSize());
-        info.setCreateBy(user.getUserName());
-        info.setCreateTime(new Date());
-        info.setUpdateBy(user.getUserName());
-        docInfoService.updateDocInfoByOnlyOffice(info);
     }
 }

+ 1 - 0
doc-biz/src/main/java/com/doc/biz/service/IDocInfoService.java

@@ -73,4 +73,5 @@ public interface IDocInfoService {
      * @param info 文件信息
      */
     void updateDocInfoByOnlyOffice(DocInfo info);
+
 }

+ 28 - 0
doc-biz/src/main/java/com/doc/biz/service/IOcrService.java

@@ -0,0 +1,28 @@
+package com.doc.biz.service;
+
+import org.springframework.web.multipart.MultipartFile;
+
+/**
+ * OCR (text recognition) service.
+ *
+ * @author wukai
+ */
+public interface IOcrService {
+    /**
+     * Recognises the text of a file identified by its id.
+     *
+     * @param fileId id of the file
+     * @return recognition result
+     * @throws Exception if recognition fails
+     */
+    String recognition(String fileId) throws Exception;
+
+    /**
+     * Recognises the text of an uploaded file.
+     *
+     * @param file uploaded file
+     * @return recognition result
+     * @throws Exception if recognition fails
+     */
+    String recognition(MultipartFile file) throws Exception;
+}

+ 0 - 12
doc-biz/src/main/java/com/doc/biz/service/impl/DocInfoServiceImpl.java

@@ -62,11 +62,8 @@ public class DocInfoServiceImpl implements IDocInfoService {
      */
     @Override
     public void updateDocInfoByOnlyOffice(DocInfo info) {
-
         info.setUpdateTime(DateUtils.getNowDate());
-
         docInfoMapper.updateDocInfo(info);
-
         if (StringUtils.isNotEmpty(info.getFileId())) {
             elasticSearchService.save(info);
         }
@@ -150,17 +147,8 @@ public class DocInfoServiceImpl implements IDocInfoService {
     @Override
     public int updateDocInfo(DocInfo docInfo) {
         checkDuplicateNames(docInfo);
-
         docInfo.setUpdateTime(DateUtils.getNowDate());
-
         int i = docInfoMapper.updateDocInfo(docInfo);
-
-        if (StringUtils.isNotEmpty(docInfo.getFileId())) {
-            elasticSearchService.save(docInfo);
-        }
-        //改变空间容量
-        spaceService.updateUsedCap(docInfo.getSpaceId());
-
         return i;
     }
 

+ 90 - 22
doc-biz/src/main/java/com/doc/biz/service/impl/ElasticSearchServiceImpl.java

@@ -5,9 +5,14 @@ import com.doc.biz.domain.EsDocInfo;
 import com.doc.biz.service.IElasticSearchService;
 import com.doc.biz.service.IEsDocInfoService;
 import com.doc.biz.service.IMongoService;
+import com.doc.biz.service.IOcrService;
 import com.doc.biz.vo.DocumentVO;
 import com.doc.common.config.EsConfig;
+import com.doc.common.constant.Constants;
 import com.doc.common.utils.FileContentUtils;
+import com.doc.common.utils.StringUtils;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;
 
 import javax.annotation.Resource;
@@ -22,6 +27,7 @@ import java.util.function.Function;
  * @date 2023-08-15
  */
 @Service
+@Slf4j
 public class ElasticSearchServiceImpl implements IElasticSearchService {
     @Resource
     private IEsDocInfoService esDocInfoService;
@@ -29,6 +35,8 @@ public class ElasticSearchServiceImpl implements IElasticSearchService {
     private EsConfig esConfig;
     @Resource
     private IMongoService mongoService;
+    @Resource
+    private IOcrService ocrService;
 
     /**
      * 文件内容入es库
@@ -36,29 +44,18 @@ public class ElasticSearchServiceImpl implements IElasticSearchService {
      * @param info 文档信息
      */
     @Override
+    @Async("threadPoolTaskExecutor")
     public void save(DocInfo info) {
         if (info.getFileSize() == 0) {
             return;
         }
-        Map<String, Function<byte[], String>> handlerMap = new HashMap<>(16);
-        handlerMap.put(".docx", FileContentUtils::getContentDocx);
-        handlerMap.put(".doc", FileContentUtils::getContentDoc);
-        handlerMap.put(".wps", FileContentUtils::getContentWps);
-        handlerMap.put(".txt", FileContentUtils::getContentTxt);
-
-        Function<byte[], String> handler = handlerMap.get(info.getFileType());
-        if (handler != null) {
-            //组装ES索引名
-            String indexName = "docs_" + info.getSpaceId();
-            //获取文件
-            DocumentVO vo = mongoService.downloadFile(info.getFileId());
-            if (vo != null) {
-                String content = handler.apply(vo.getData());
-                EsDocInfo esDocInfo = new EsDocInfo(info.getDocId(), content);
-
-                esConfig.setIndexName(indexName);
-                esDocInfoService.save(esDocInfo);
-            }
+        String fileType = info.getFileType().toLowerCase();
+        Map<String, Function<byte[], String>> handlerMap = createHandlerMap(fileType);
+        String content = getContent(handlerMap, fileType, info.getFileId());
+        if (StringUtils.isNotEmpty(content)) {
+            comboIndex(info.getSpaceId());
+            EsDocInfo esDocInfo = new EsDocInfo(info.getDocId(), content);
+            esDocInfoService.save(esDocInfo);
         }
     }
 
@@ -69,10 +66,81 @@ public class ElasticSearchServiceImpl implements IElasticSearchService {
      */
     @Override
     public void delete(DocInfo info) {
-        //组装ES索引名
-        String indexName = "docs_" + info.getSpaceId();
+        comboIndex(info.getSpaceId());
+        esDocInfoService.deleteById(info.getDocId());
+    }
 
+    /**
+     * 组装ES索引名
+     *
+     * @param spaceId 空间ID
+     */
+    private void comboIndex(Long spaceId) {
+        String indexName = "docs_" + spaceId;
         esConfig.setIndexName(indexName);
-        esDocInfoService.deleteById(info.getDocId());
     }
+
+    /**
+     * Builds the lookup table that maps a file type to the function able to
+     * extract plain text from that kind of file.
+     *
+     * <p>The returned map holds at most one entry (for {@code fileType}) and is
+     * empty when the type is not supported.
+     *
+     * @param fileType lower-case file extension including the leading dot, e.g. {@code ".docx"}
+     * @return map from {@code fileType} to its text extractor, possibly empty
+     */
+    private Map<String, Function<byte[], String>> createHandlerMap(String fileType) {
+        Map<String, Function<byte[], String>> handlerMap = new HashMap<>(16);
+        switch (fileType) {
+            case ".docx":
+                handlerMap.put(fileType, FileContentUtils::getContentDocx);
+                break;
+            case ".doc":
+            case ".wps":
+                // Binary Word/WPS documents need the legacy extractor; getContentDoc
+                // itself falls back to the .docx parser when the bytes are OOXML.
+                handlerMap.put(fileType, FileContentUtils::getContentDoc);
+                break;
+            case ".xls":
+            case ".et":
+                handlerMap.put(fileType, FileContentUtils::getContentXls);
+                break;
+            case ".xlsx":
+                handlerMap.put(fileType, FileContentUtils::getContentXlsx);
+                break;
+            case ".ppt":
+            case ".dps":
+                handlerMap.put(fileType, FileContentUtils::getContentPpt);
+                break;
+            case ".pptx":
+                handlerMap.put(fileType, FileContentUtils::getContentPptx);
+                break;
+            case ".pdf":
+                handlerMap.put(fileType, FileContentUtils::getContentPdf);
+                break;
+            case ".txt":
+                // Plain text was indexed before this refactoring; keep supporting it.
+                handlerMap.put(fileType, FileContentUtils::getContentTxt);
+                break;
+            default:
+                break;
+        }
+        return handlerMap;
+    }
+
+    /**
+     * Extracts the text content of a stored file.
+     *
+     * <p>Image files are sent to the OCR service; every other type is parsed by
+     * the handler registered for it. An empty string is returned when the type
+     * is unsupported, the file cannot be downloaded, or OCR fails.
+     *
+     * @param handlerMap text extractors keyed by file type
+     * @param fileType   lower-case file type of the document
+     * @param fileId     id of the stored file
+     * @return extracted text, or an empty string when nothing could be extracted
+     */
+    private String getContent(Map<String, Function<byte[], String>> handlerMap, String fileType, String fileId) {
+        // NOTE(review): fileType elsewhere carries a leading dot (".docx"); presumably
+        // Constants.IMAGE_EXTENSION uses the same form - confirm.
+        if (Constants.IMAGE_EXTENSION.contains(fileType)) {
+            try {
+                return ocrService.recognition(fileId);
+            } catch (Exception e) {
+                // Keep the stack trace: the message alone is often null or too terse.
+                log.error("解析图片错误:{}", e.getMessage(), e);
+                return "";
+            }
+        }
+
+        Function<byte[], String> handler = handlerMap.get(fileType);
+        if (handler == null) {
+            return "";
+        }
+        DocumentVO vo = mongoService.downloadFile(fileId);
+        if (vo == null) {
+            return "";
+        }
+        return handler.apply(vo.getData());
+    }
+
 }

+ 77 - 0
doc-biz/src/main/java/com/doc/biz/service/impl/OcrServiceImpl.java

@@ -0,0 +1,77 @@
+package com.doc.biz.service.impl;
+
+import cn.hutool.http.HttpUtil;
+import com.doc.biz.service.IMongoService;
+import com.doc.biz.service.IOcrService;
+import com.doc.biz.vo.DocumentVO;
+import com.doc.system.service.ISysConfigService;
+import org.springframework.stereotype.Service;
+import org.springframework.web.multipart.MultipartFile;
+
+import javax.annotation.Resource;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.HashMap;
+
+/**
+ * OCR service implementation: writes the image to a temporary file and posts
+ * it to the external (Python) OCR HTTP API configured under {@code api.ocr.uri}.
+ *
+ * @author wukai
+ * @date 2023-08-15
+ */
+@Service
+public class OcrServiceImpl implements IOcrService {
+    /**
+     * Fixed temp-file prefix. A constant is used instead of the original file
+     * name because {@link File#createTempFile(String, String)} rejects prefixes
+     * shorter than three characters and client-supplied names are untrusted.
+     */
+    private static final String TEMP_PREFIX = "ocr-";
+
+    /** System-config key that holds the OCR endpoint URI. */
+    private static final String OCR_URI_KEY = "api.ocr.uri";
+
+    @Resource
+    private ISysConfigService configService;
+    @Resource
+    private IMongoService mongoService;
+
+    /**
+     * Runs OCR on a file that is already stored, looked up by its id.
+     *
+     * @param fileId id of the stored file
+     * @return raw response body of the OCR API
+     * @throws IllegalArgumentException if no file exists for {@code fileId}
+     * @throws Exception                if the temp file cannot be written or the call fails
+     */
+    @Override
+    public String recognition(String fileId) throws Exception {
+        DocumentVO vo = mongoService.downloadFile(fileId);
+        if (vo == null) {
+            throw new IllegalArgumentException("File not found, fileId=" + fileId);
+        }
+        File upFile = File.createTempFile(TEMP_PREFIX, toTempSuffix(vo.getSuffix()));
+        try {
+            Files.write(upFile.toPath(), vo.getData());
+            return callPythonOcrApi(upFile);
+        } finally {
+            deleteQuietly(upFile);
+        }
+    }
+
+    /**
+     * Runs OCR on an uploaded file.
+     *
+     * @param file uploaded file
+     * @return raw response body of the OCR API
+     * @throws Exception if the upload cannot be stored temporarily or the call fails
+     */
+    @Override
+    public String recognition(MultipartFile file) throws Exception {
+        // Only the extension after the last dot is taken from the client-supplied
+        // name; names without an extension or with several dots are handled.
+        String originalFilename = file.getOriginalFilename();
+        String extension = null;
+        if (originalFilename != null) {
+            int dot = originalFilename.lastIndexOf('.');
+            if (dot >= 0) {
+                extension = originalFilename.substring(dot + 1);
+            }
+        }
+        File upFile = File.createTempFile(TEMP_PREFIX, toTempSuffix(extension));
+        try {
+            file.transferTo(upFile);
+            return callPythonOcrApi(upFile);
+        } finally {
+            deleteQuietly(upFile);
+        }
+    }
+
+    /**
+     * Posts the file to the OCR HTTP API as multipart form field {@code file}.
+     * The caller remains responsible for deleting the file.
+     *
+     * @param file file to recognise
+     * @return raw response body of the OCR API
+     * @throws IllegalStateException if the OCR endpoint is not configured
+     */
+    private String callPythonOcrApi(File file) {
+        String uri = configService.selectConfigByKey(OCR_URI_KEY);
+        if (uri == null || uri.trim().isEmpty()) {
+            throw new IllegalStateException("OCR service URI is not configured (" + OCR_URI_KEY + ")");
+        }
+        HashMap<String, Object> form = new HashMap<>(3);
+        form.put("file", file);
+        return HttpUtil.post(uri, form);
+    }
+
+    /**
+     * Turns a file extension into a safe temp-file suffix such as {@code ".png"}.
+     *
+     * @param extension extension with or without surrounding dots; may be {@code null}
+     * @return the suffix, or {@code null} (createTempFile then uses ".tmp") when the
+     *         extension is missing or contains anything other than letters and digits
+     */
+    private static String toTempSuffix(String extension) {
+        if (extension == null) {
+            return null;
+        }
+        String ext = extension.trim();
+        int start = 0;
+        int end = ext.length();
+        while (start < end && ext.charAt(start) == '.') {
+            start++;
+        }
+        while (end > start && ext.charAt(end - 1) == '.') {
+            end--;
+        }
+        ext = ext.substring(start, end);
+        return ext.matches("[A-Za-z0-9]{1,16}") ? "." + ext : null;
+    }
+
+    /**
+     * Best-effort removal of a temporary file; never throws.
+     *
+     * @param file temporary file to delete
+     */
+    private static void deleteQuietly(File file) {
+        try {
+            Files.deleteIfExists(file.toPath());
+        } catch (IOException ignored) {
+            // Cleanup must not mask the OCR result or the original error;
+            // fall back to removing the file when the JVM exits.
+            file.deleteOnExit();
+        }
+    }
+}

+ 82 - 102
doc-common/src/main/java/com/doc/common/utils/FileContentUtils.java

@@ -2,17 +2,17 @@ package com.doc.common.utils;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
-import org.apache.poi.xslf.usermodel.XSLFShape;
-import org.apache.poi.xslf.usermodel.XSLFSlide;
-import org.apache.poi.xslf.usermodel.XSLFTextShape;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
-import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
@@ -20,7 +20,6 @@ import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Paths;
-import java.util.List;
 
 /**
  * 读取文件内容
@@ -42,18 +41,8 @@ public class FileContentUtils {
             //根据需求入参也可以改为文件路径,对应的输入流部分改为new File(路径)即可
             // 2007版本的word
             XWPFDocument xwpf = new XWPFDocument(is);
-            // 2007版本,仅支持docx文件处理
-            List<XWPFParagraph> paragraphs = xwpf.getParagraphs();
-            StringBuilder content = new StringBuilder();
-
-            for (XWPFParagraph paragraph : paragraphs) {
-                if (!paragraph.getParagraphText().startsWith("    ")) {
-                    content.append(paragraph.getParagraphText().trim()).append("\r\n");
-                } else {
-                    content.append(paragraph.getParagraphText());
-                }
-            }
-            return content.toString();
+            XWPFWordExtractor extractor = new XWPFWordExtractor(xwpf);
+            return extractor.getText();
         } catch (Exception e) {
             e.printStackTrace();
             return "";
@@ -70,19 +59,7 @@ public class FileContentUtils {
         try (InputStream is = new ByteArrayInputStream(data)) {
             // 2003版本的word
             WordExtractor wordExtractor = new WordExtractor(is);
-            // 2003版本 仅doc格式文件可处理,docx文件不可处理
-            String[] text = wordExtractor.getParagraphText();
-            StringBuilder content = new StringBuilder();
-
-            // 获取段落,段落缩进无法获取,可以在前添加空格填充
-            for (String str : text) {
-                if (!str.startsWith("    ")) {
-                    content.append(str.trim()).append("\r\n");
-                } else {
-                    content.append(str);
-                }
-            }
-            return content.toString();
+            return wordExtractor.getText();
         } catch (IllegalArgumentException e) {
             //做兼容,因为onlyoffice在线编辑时,会将2003的文档保存为word2007的格式。
             return getContentDocx(data);
@@ -93,37 +70,6 @@ public class FileContentUtils {
     }
 
     /**
-     * 获取正文文件内容,wps方法
-     *
-     * @param data 二进制文件内容
-     * @return
-     */
-    public static String getContentWps(byte[] data) {
-        // 0表示获取正常,1表示获取异常
-        try (InputStream is = new ByteArrayInputStream(data)) {
-            // wps版本word
-            HWPFDocument hwpf = new HWPFDocument(is);
-
-            // 文档文本内容
-            String[] text = new WordExtractor(hwpf).getParagraphText();
-            StringBuilder content = new StringBuilder();
-            if (text != null && text.length > 0) {
-                for (String str : text) {
-                    if (!str.startsWith("    ")) {
-                        content.append(str.trim()).append("\r\n");
-                    } else {
-                        content.append(str);
-                    }
-                }
-            }
-            return content.toString();
-        } catch (Exception e) {
-            e.printStackTrace();
-            return "";
-        }
-    }
-
-    /**
      * 获取正文文件内容,PDF方法
      *
      * @param data 二进制文件内容
@@ -150,25 +96,17 @@ public class FileContentUtils {
     }
 
     /**
-     * 获取正文文件内容,PDF方法
+     * 获取正文文件内容,PPTX方法
      *
      * @param data 二进制文件内容
      * @return
      */
-    public static String getContentPpt(byte[] data) {
+    public static String getContentPptx(byte[] data) {
         String result = "";
         try (InputStream is = new ByteArrayInputStream(data)) {
             XMLSlideShow ppt = new XMLSlideShow(is);
-            List<XSLFSlide> slideList = ppt.getSlides();
-            for (XSLFSlide slide : slideList) {
-                List<XSLFShape> shapeList = slide.getShapes();
-                for (XSLFShape shape : shapeList) {
-                    if (shape instanceof XSLFTextShape) {
-                        XSLFTextShape textSharp = (XSLFTextShape) shape;
-                        result += textSharp.getText() + "\n";
-                    }
-                }
-            }
+            SlideShowExtractor e = new SlideShowExtractor(ppt);
+            return e.getText();
         } catch (IOException e) {
             e.printStackTrace();
         }
@@ -176,41 +114,84 @@ public class FileContentUtils {
     }
 
     /**
-     * 获取正文文件内容,PDF方法
+     * 获取正文文件内容,PPT方法
      *
      * @param data 二进制文件内容
      * @return
      */
-    public static String getContentExcel(byte[] data) {
-        String result = "";
+    public static String getContentPpt(byte[] data) {
+        try (InputStream is = new ByteArrayInputStream(data)) {
+            HSLFSlideShow slideShow = new HSLFSlideShow(is);
+            SlideShowExtractor e = new SlideShowExtractor(slideShow);
+            return e.getText();
+        } catch (IOException e) {
+            e.printStackTrace();
+            return "";
+        }
+    }
+
+    /**
+     * 获取正文文件内容,XLSX方法
+     *
+     * @param data 二进制文件内容
+     * @return 结果
+     */
+    public static String getContentXlsx(byte[] data) {
         try (InputStream is = new ByteArrayInputStream(data);
              XSSFWorkbook xssfWorkbook = new XSSFWorkbook(is)) {
             //创建工作簿对象
-            //获取工作簿下sheet的个数
-            int sheetNum = xssfWorkbook.getNumberOfSheets();
-            //遍历工作簿中的所有数据
-            for (int i = 0; i < sheetNum; i++) {
-                //读取第i个工作表
-                XSSFSheet sheet = xssfWorkbook.getSheetAt(i);
-                //获取最后一行的num,即总行数。此处从0开始
-                int maxRow = sheet.getLastRowNum();
-                for (int rowNum = 0; rowNum <= maxRow; rowNum++) {
-                    Row row = sheet.getRow(rowNum);
-                    if (row != null) {
-                        //获取最后单元格num,即总单元格数 ***注意:此处从1开始计数***
-                        int maxRol = row.getLastCellNum();
-                        for (int cellNum = 0; cellNum < maxRol; cellNum++) {
-                            result += row.getCell(cellNum) + "  ";
-                        }
-                        result += "\n";
-                    }
-                }
-            }
+            return getSheetText(xssfWorkbook);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return "";
+        }
+    }
 
+    /**
+     * 获取正文文件内容,xls
+     *
+     * @param data 二进制文件内容
+     * @return
+     */
+    public static String getContentXls(byte[] data) {
+        try (InputStream is = new ByteArrayInputStream(data);
+             HSSFWorkbook hssfWorkbook = new HSSFWorkbook(is)) {
+            //创建工作簿对象
+            return getSheetText(hssfWorkbook);
         } catch (IOException e) {
             e.printStackTrace();
+            return "";
         }
+    }
 
+    /**
+     * 获取工作表内容
+     *
+     * @param workbook 工作簿
+     * @return txt
+     */
+    private static String getSheetText(Workbook workbook) {
+        String result = "";
+        //获取工作簿下sheet的个数
+        int sheetNum = workbook.getNumberOfSheets();
+        //遍历工作簿中的所有数据
+        for (int i = 0; i < sheetNum; i++) {
+            //读取第i个工作表
+            Sheet sheet = workbook.getSheetAt(i);
+            //获取最后一行的num,即总行数。此处从0开始
+            int maxRow = sheet.getLastRowNum();
+            for (int rowNum = 0; rowNum <= maxRow; rowNum++) {
+                Row row = sheet.getRow(rowNum);
+                if (row != null) {
+                    //获取最后单元格num,即总单元格数 ***注意:此处从1开始计数***
+                    int maxRol = row.getLastCellNum();
+                    for (int cellNum = 0; cellNum < maxRol; cellNum++) {
+                        result += row.getCell(cellNum) + "  ";
+                    }
+                    result += "\n";
+                }
+            }
+        }
         return result.replace("\n\n", "\n");
     }
 
@@ -230,11 +211,10 @@ public class FileContentUtils {
     }
 
     public static void main(String[] args) throws Exception {
-        String path = "D:\\SYSTEM\\Desktop\\temp\\parse\\test.xlsx";
-
-        byte[] bytes = Files.readAllBytes(Paths.get(path));
+        String path = "D:\\SYSTEM\\Desktop\\temp\\parse\\test.docx";
 
-        String result = getContentExcel(bytes);
+        byte[] data = Files.readAllBytes(Paths.get(path));
+        String result = getContentDoc(data);
         System.err.println(result);
     }
 }

+ 17 - 14
doc-framework/src/main/java/com/doc/framework/config/ThreadPoolConfig.java

@@ -16,23 +16,29 @@ import java.util.concurrent.ThreadPoolExecutor;
  * @author ruoyi
  **/
 @Configuration
-public class ThreadPoolConfig
-{
-    // 核心线程池大小
+public class ThreadPoolConfig {
+    /**
+     * 核心线程池大小
+     */
     private int corePoolSize = 50;
 
-    // 最大可创建的线程数
+    /**
+     * 最大可创建的线程数
+     */
     private int maxPoolSize = 200;
 
-    // 队列最大长度
+    /**
+     * 队列最大长度
+     */
     private int queueCapacity = 1000;
 
-    // 线程池维护线程所允许的空闲时间
+    /**
+     * 线程池维护线程所允许的空闲时间
+     */
     private int keepAliveSeconds = 300;
 
     @Bean(name = "threadPoolTaskExecutor")
-    public ThreadPoolTaskExecutor threadPoolTaskExecutor()
-    {
+    public ThreadPoolTaskExecutor threadPoolTaskExecutor() {
         ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
         executor.setMaxPoolSize(maxPoolSize);
         executor.setCorePoolSize(corePoolSize);
@@ -47,15 +53,12 @@ public class ThreadPoolConfig
      * 执行周期性或定时任务
      */
     @Bean(name = "scheduledExecutorService")
-    protected ScheduledExecutorService scheduledExecutorService()
-    {
+    protected ScheduledExecutorService scheduledExecutorService() {
         return new ScheduledThreadPoolExecutor(corePoolSize,
                 new BasicThreadFactory.Builder().namingPattern("schedule-pool-%d").daemon(true).build(),
-                new ThreadPoolExecutor.CallerRunsPolicy())
-        {
+                new ThreadPoolExecutor.CallerRunsPolicy()) {
             @Override
-            protected void afterExecute(Runnable r, Throwable t)
-            {
+            protected void afterExecute(Runnable r, Throwable t) {
                 super.afterExecute(r, t);
                 Threads.printException(r, t);
             }

+ 2 - 2
sql/view.sql

@@ -13,11 +13,11 @@ SELECT DIR_ID,
        UPDATE_BY,
        UPDATE_TIME,
        REMARK,
-       (WITH RECURSIVE VDIR AS (SELECT DIR_ID, DIR_NAME, PARENT_ID, DIR_NAME AS DPATH
+       (WITH RECURSIVE VDIR AS (SELECT DIR_ID, DIR_NAME, PARENT_ID, CAST(DIR_NAME AS CHAR(4000)) AS DPATH
                                 FROM DOC_DIR
                                 WHERE DIR_ID = A.DIR_ID
                                 UNION ALL
-                                SELECT C.DIR_ID, C.DIR_NAME, C.PARENT_ID, CONCAT(C.DIR_NAME, '/', P.DPATH) AS DPATH
+                                SELECT C.DIR_ID, C.DIR_NAME, C.PARENT_ID, CONCAT(C.DIR_NAME, '/', P.DPATH) AS DPATH
                                 FROM DOC_DIR C
                                          JOIN VDIR P ON C.DIR_ID = P.PARENT_ID)
         SELECT DPATH