Prechádzať zdrojové kódy

读取PDF和PPT文字内容

wukai 1 rok pred
rodič
commit
3d282568df

+ 3 - 8
doc-common/pom.xml

@@ -149,14 +149,9 @@
         </dependency>
         <!-- PDF操作 -->
         <dependency>
-            <groupId>org.apache.tika</groupId>
-            <artifactId>tika-core</artifactId>
-            <version>2.9.0</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.tika</groupId>
-            <artifactId>tika-parsers-standard-package</artifactId>
-            <version>2.9.0</version>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox-tools</artifactId>
+            <version>2.0.27</version>
         </dependency>
         <!-- POI-word文件处理需要 -->
         <dependency>

+ 62 - 0
doc-common/src/main/java/com/doc/common/utils/FileContentUtils.java

@@ -1,11 +1,18 @@
 package com.doc.common.utils;
 
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFShape;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFTextShape;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 
 import java.io.ByteArrayInputStream;
+import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
@@ -111,6 +118,58 @@ public class FileContentUtils {
         }
     }
 
+    /**
+     * 获取正文文件内容,PDF方法
+     *
+     * @param data 二进制文件内容
+     * @return
+     */
+    public static String getContentPdf(byte[] data) {
+        String result = "";
+        try {
+            PDDocument load = PDDocument.load(data);
+            PDFTextStripper stripper = new PDFTextStripper();
+            stripper.setSortByPosition(true);
+            for (int i = 1; i < load.getNumberOfPages() + 1; i++) {
+                //读取pdf的开始到结束页
+                stripper.setStartPage(i);
+                stripper.setEndPage(i);
+                String text = stripper.getText(load);
+                //拼接不同页数的数据返回
+                result += text;
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        return result;
+    }
+
+    /**
+     * 获取正文文件内容,PDF方法
+     *
+     * @param data 二进制文件内容
+     * @return
+     */
+    public static String getContentPpt(byte[] data) {
+        String result = "";
+        try (InputStream is = new ByteArrayInputStream(data)) {
+            XMLSlideShow ppt = new XMLSlideShow(is);
+            List<XSLFSlide> slideList = ppt.getSlides();
+            for (XSLFSlide slide : slideList) {
+                List<XSLFShape> shapeList = slide.getShapes();
+                for (XSLFShape shape : shapeList) {
+                    if (shape instanceof XSLFTextShape) {
+                        XSLFTextShape textSharp = (XSLFTextShape) shape;
+                        result += textSharp.getText() + "\n";
+                    }
+                }
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        return result.replace("\n\n", "\n");
+    }
+
     public static String getContentTxt(byte[] data) {
         return new String(data, StandardCharsets.UTF_8).trim();
 //        try (InputStream is = new ByteArrayInputStream(data); BufferedReader br = new BufferedReader(data)) {
@@ -125,4 +184,7 @@ public class FileContentUtils {
 //            return "";
 //        }
     }
+
+    public static void main(String[] args) { // empty entry point: performs no work
+    }
 }

+ 0 - 54
doc-common/src/main/java/com/doc/common/utils/PPT.java

@@ -1,54 +0,0 @@
-package com.doc.common.utils;
-
-import org.apache.poi.hslf.extractor.PowerPointExtractor;
-import org.apache.poi.hslf.usermodel.HSLFSlide;
-import org.apache.poi.sl.usermodel.Slide;
-import org.apache.poi.sl.usermodel.SlideShow;
-import org.apache.poi.sl.usermodel.TextRun;
-import org.apache.poi.xslf.usermodel.XMLSlideShow;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.List;
-
-public class PPT { // helpers that read text from PowerPoint files using Apache POI
-    // Extracts the entire text of the slide show in one call.
-    public static String readDoc1(InputStream is) throws IOException {
-        PowerPointExtractor extractor = new PowerPointExtractor(is);
-        return extractor.getText();
-    }
-
-    // Reads the presentation slide by slide, printing each title and the slide's comments.
-    public static void readDoc2(InputStream is) throws IOException {
-        SlideShow ss = new XMLSlideShow(is);
-        List<Slide> slides = ss.getSlides();
-        for (Slide slide : slides) {
-            // read the title of one slide
-            String title = slide.getTitle();
-            System.out.println("标题:" + title);
-//            // read the content of one slide (including the title)
-
-            List<TextRun> runs = slide.getComments(); // NOTE(review): getComments() assigned to List<TextRun> — confirm the types match
-            for (TextRun run:runs) {
-                System.out.println(run.getRawText());
-            }
-        }
-    }
-
-    public static void main(String[] args) { // manual run against a hard-coded local .pptx path
-        File file = new File("D:\\SYSTEM\\Desktop\\temp\\测试ppt.pptx");
-        try (FileInputStream is = new FileInputStream(file)){
-            SlideShow ppt=new XMLSlideShow(is);
-
-            List<HSLFSlide> slides = ppt.getSlides(); // NOTE(review): HSLFSlide is from the hslf package while ppt is an XMLSlideShow — confirm
-            for (HSLFSlide slide : slides) {
-                System.err.println(slide.getTitle());
-                System.err.println(slide.getComments());
-            }
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-}

+ 0 - 153
doc-common/src/main/java/com/doc/common/utils/ReadContentHandler.java

@@ -1,153 +0,0 @@
-package com.doc.common.utils;
-
-import com.alibaba.fastjson2.JSONArray;
-import com.alibaba.fastjson2.JSONObject;
-import org.apache.tika.Tika;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.pdf.PDFParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ToXMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * @Author xiaobai
- * @Date 2023/7/12 14:04
- * @Title: PageContentHandler
- * @Package com.xiaobai.util
- * @description: SAX content handler that collects character data per page (a div with class "page"), plus static Tika helper methods.
- */
-public class ReadContentHandler extends ToXMLContentHandler {
-    private String pageTag = "div";
-    private String pageClass = "page";
-    private int pageNumber = 0; // number of the page currently being collected; 0 until the first page starts
-
-    private Map<Integer, StringBuilder> pageMap; // page number -> text collected for that page
-
-    public ReadContentHandler() {
-        super();
-        pageMap = new HashMap<>();
-    }
-
-    private void startPage() { // advances the page counter and registers an empty buffer for the new page
-        pageNumber++;
-        pageMap.put(pageNumber, new StringBuilder());
-    }
-
-    private void endPage() { // no-op hook invoked when a page element closes
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-        if (pageTag.equals(qName) && pageClass.equals(atts.getValue("class"))) { // a <div class="page"> opens a new page
-            startPage();
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        if (pageTag.equals(qName)) { // fires for every closing div, not only page divs
-            endPage();
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        if (length > 0 && pageNumber > 0) { // text seen before the first page is ignored
-            if (ch.length == 1 && ch[0] == '\n') { // NOTE(review): tests the array size, not the length argument — confirm intent
-                return;
-            }
-            pageMap.get(pageNumber).append(ch); // NOTE(review): appends the whole array, ignoring start/length — confirm intent
-//            pageMap.get(pageNumber).append('\n');
-        }
-    }
-
-    /**
-     * Parses a file with Tika's AutoDetectParser and returns the metadata filled in by the parse.
-     *
-     * @param file the file to parse
-     * @return the Metadata object passed to the parser
-     * @throws IOException
-     * @throws SAXException
-     * @throws TikaException
-     */
-    public static Metadata fileData(File file) throws IOException, SAXException, TikaException {
-        FileInputStream input = new FileInputStream(file);// the file may be a pdf, word, html, etc.
-        BodyContentHandler textHandler = new BodyContentHandler();// receives the content
-        Metadata matadata = new Metadata();// Metadata holds the author, title and other metadata
-        AutoDetectParser parser = new AutoDetectParser();// AutoDetectParser guesses the document's MIME type when parsing; for PDF input a PDFParser could be used instead
-        ParseContext context = new ParseContext();
-        parser.parse(input, textHandler, matadata, context);// run the parse
-        input.close(); // NOTE(review): not reached if parse throws, so the stream would stay open
-
-        return matadata;
-    }
-
-    /**
-     * Reads the text content of a file.
-     *
-     * @param file supports txt/word/excel/pdf and other formats
-     * @return the string produced by Tika.parseToString
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static String parseText(File file) throws TikaException, IOException {
-        Tika tika = new Tika();
-        return tika.parseToString(file);
-    }
-
-
-    /**
-     * Reads the content of a file page by page.
-     *
-     * @param file PDF only
-     * @return a JSONArray of objects with "page" and "content" keys
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static JSONArray parsePageToPdf(File file) throws Exception {
-        JSONArray jsonArray = new JSONArray();
-        JSONObject jsonObject = null;
-
-        ReadContentHandler handler = new ReadContentHandler();
-        Metadata metadata = new Metadata();
-
-        FileInputStream inputstream = new FileInputStream(file); // NOTE(review): never closed in this method
-        ParseContext pcontext = new ParseContext();
-
-        //parsing the document using PDF parser
-        PDFParser pdfparser = new PDFParser();
-        pdfparser.parse(inputstream, handler, metadata, pcontext);
-
-        //getting the content of the document by pages.
-        for (Map.Entry<Integer, StringBuilder> entry : handler.pageMap.entrySet()) { // NOTE(review): pageMap is a HashMap, so iteration order is not guaranteed to follow page order
-            jsonObject = new JSONObject();
-            jsonObject.put("page", entry.getKey());
-            jsonObject.put("content", entry.getValue().toString());
-            jsonArray.add(jsonObject);
-        }
-
-        return jsonArray;
-    }
-
-
-    public static void main(String[] args) throws Exception { // manual run against a hard-coded local PDF path
-        String path = "D:\\SYSTEM\\Desktop\\temp\\SpringBoot 整合 TrueLicense 实现 License 的授权与服务器许可1.pdf";
-        File f = new File(path);
-        JSONArray json = parsePageToPdf(f);
-
-        System.err.println(json);
-//        String txt = parseText(f);
-//        System.err.println(txt);
-    }
-
-}