wukai před 2 roky
rodič
revize
b213121f02

+ 1 - 1
doc-admin/src/main/resources/application-aliyun.yml

@@ -1,4 +1,4 @@
-# 开发环境配置
+  # 开发环境配置
 server:
   # 服务器的HTTP端口,默认为8080
   port: 8080

+ 21 - 0
doc-admin/src/test/java/Test.java

@@ -0,0 +1,21 @@
+/**
+ * Test$
+ *
+ * @author wukai
+ * @date 2023/11/14 15:13
+ */
+public class Test {
+    public static void main(String[] args) {
+        try {
+            String cmd = " /usr/local/mysql/bin/mysqldump -h192.168.1.28 -P3306 -uroot -p123456 --default-character-set=utf8 doc >/tmp/test.sql";
+            Process exec = Runtime.getRuntime().exec(cmd);
+            if (exec.waitFor() == 0) {
+                System.out.println("数据库备份成功");
+            } else {
+                System.out.println("process.waitFor()=" + exec.waitFor());
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+}

+ 11 - 1
doc-common/pom.xml

@@ -147,7 +147,17 @@
             <artifactId>hutool-all</artifactId>
             <version>5.8.18</version>
         </dependency>
-
+        <!-- Apache Tika: content and metadata extraction (PDF, Word, Excel, ...) -->
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>2.9.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers-standard-package</artifactId>
+            <version>2.9.0</version>
+        </dependency>
         <!-- POI-word文件处理需要 -->
         <dependency>
             <groupId>org.apache.poi</groupId>

+ 54 - 0
doc-common/src/main/java/com/doc/common/utils/PPT.java

@@ -0,0 +1,54 @@
+package com.doc.common.utils;
+
+import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hslf.usermodel.HSLFSlide;
+import org.apache.poi.sl.usermodel.Slide;
+import org.apache.poi.sl.usermodel.SlideShow;
+import org.apache.poi.sl.usermodel.TextRun;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+public class PPT {
+    //直接抽取幻灯片的全部内容
+    public static String readDoc1(InputStream is) throws IOException {
+        PowerPointExtractor extractor = new PowerPointExtractor(is);
+        return extractor.getText();
+    }
+
+    //一张幻灯片一张幻灯片地读取
+    public static void readDoc2(InputStream is) throws IOException {
+        SlideShow ss = new XMLSlideShow(is);
+        List<Slide> slides = ss.getSlides();
+        for (Slide slide : slides) {
+            //读取一张幻灯片的标题
+            String title = slide.getTitle();
+            System.out.println("标题:" + title);
+//            //读取一张幻灯片的内容(包括标题)
+
+            List<TextRun> runs = slide.getComments();
+            for (TextRun run:runs) {
+                System.out.println(run.getRawText());
+            }
+        }
+    }
+
+    public static void main(String[] args) {
+        File file = new File("D:\\SYSTEM\\Desktop\\temp\\测试ppt.pptx");
+        try (FileInputStream is = new FileInputStream(file)){
+            SlideShow ppt=new XMLSlideShow(is);
+
+            List<HSLFSlide> slides = ppt.getSlides();
+            for (HSLFSlide slide : slides) {
+                System.err.println(slide.getTitle());
+                System.err.println(slide.getComments());
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+}

+ 153 - 0
doc-common/src/main/java/com/doc/common/utils/ReadContentHandler.java

@@ -0,0 +1,153 @@
+package com.doc.common.utils;
+
+import com.alibaba.fastjson2.JSONArray;
+import com.alibaba.fastjson2.JSONObject;
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * @Author xiaobai
+ * @Date 2023/7/12 14:04
+ * @Title: PageContentHandler
+ * @Package com.xiaobai.util
+ * @description:
+ */
+public class ReadContentHandler extends ToXMLContentHandler {
+    private String pageTag = "div";
+    private String pageClass = "page";
+    private int pageNumber = 0;
+
+    private Map<Integer, StringBuilder> pageMap;
+
+    public ReadContentHandler() {
+        super();
+        pageMap = new HashMap<>();
+    }
+
+    private void startPage() {
+        pageNumber++;
+        pageMap.put(pageNumber, new StringBuilder());
+    }
+
+    private void endPage() {
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (pageTag.equals(qName) && pageClass.equals(atts.getValue("class"))) {
+            startPage();
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if (pageTag.equals(qName)) {
+            endPage();
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (length > 0 && pageNumber > 0) {
+            if (ch.length == 1 && ch[0] == '\n') {
+                return;
+            }
+            pageMap.get(pageNumber).append(ch);
+//            pageMap.get(pageNumber).append('\n');
+        }
+    }
+
+    /**
+     * 文件基本信息
+     *
+     * @param file
+     * @return
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    public static Metadata fileData(File file) throws IOException, SAXException, TikaException {
+        FileInputStream input = new FileInputStream(file);//可以写文件路径,pdf,word,html等
+        BodyContentHandler textHandler = new BodyContentHandler();//获取内容
+        Metadata matadata = new Metadata();//Metadata对象保存了作者,标题等元数据
+        AutoDetectParser parser = new AutoDetectParser();//当调用parser,AutoDetectParser会自动估计文档MIME类型,此处输入PDP文件,因此可以使用PDFParser
+        ParseContext context = new ParseContext();
+        parser.parse(input, textHandler, matadata, context);//执行解析过程
+        input.close();
+
+        return matadata;
+    }
+
+    /**
+     * 读取文件内容
+     *
+     * @param file 支持txt/word/excle/pdf等多种格式
+     * @return
+     * @throws TikaException
+     * @throws IOException
+     */
+    public static String parseText(File file) throws TikaException, IOException {
+        Tika tika = new Tika();
+        return tika.parseToString(file);
+    }
+
+
+    /**
+     * 按页读取文件内容
+     *
+     * @param file 仅支持pdf
+     * @return
+     * @throws TikaException
+     * @throws IOException
+     */
+    public static JSONArray parsePageToPdf(File file) throws Exception {
+        JSONArray jsonArray = new JSONArray();
+        JSONObject jsonObject = null;
+
+        ReadContentHandler handler = new ReadContentHandler();
+        Metadata metadata = new Metadata();
+
+        FileInputStream inputstream = new FileInputStream(file);
+        ParseContext pcontext = new ParseContext();
+
+        //parsing the document using PDF parser
+        PDFParser pdfparser = new PDFParser();
+        pdfparser.parse(inputstream, handler, metadata, pcontext);
+
+        //getting the content of the document by pages.
+        for (Map.Entry<Integer, StringBuilder> entry : handler.pageMap.entrySet()) {
+            jsonObject = new JSONObject();
+            jsonObject.put("page", entry.getKey());
+            jsonObject.put("content", entry.getValue().toString());
+            jsonArray.add(jsonObject);
+        }
+
+        return jsonArray;
+    }
+
+
+    public static void main(String[] args) throws Exception {
+        String path = "D:\\SYSTEM\\Desktop\\temp\\SpringBoot 整合 TrueLicense 实现 License 的授权与服务器许可1.pdf";
+        File f = new File(path);
+        JSONArray json = parsePageToPdf(f);
+
+        System.err.println(json);
+//        String txt = parseText(f);
+//        System.err.println(txt);
+    }
+
+}