|
|
@@ -0,0 +1,153 @@
|
|
|
+package com.doc.common.utils;
|
|
|
+
|
|
|
+import com.alibaba.fastjson2.JSONArray;
|
|
|
+import com.alibaba.fastjson2.JSONObject;
|
|
|
+import org.apache.tika.Tika;
|
|
|
+import org.apache.tika.exception.TikaException;
|
|
|
+import org.apache.tika.metadata.Metadata;
|
|
|
+import org.apache.tika.parser.AutoDetectParser;
|
|
|
+import org.apache.tika.parser.ParseContext;
|
|
|
+import org.apache.tika.parser.pdf.PDFParser;
|
|
|
+import org.apache.tika.sax.BodyContentHandler;
|
|
|
+import org.apache.tika.sax.ToXMLContentHandler;
|
|
|
+import org.xml.sax.Attributes;
|
|
|
+import org.xml.sax.SAXException;
|
|
|
+
|
|
|
+import java.io.File;
|
|
|
+import java.io.FileInputStream;
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.HashMap;
|
|
|
+import java.util.Map;
|
|
|
+
|
|
|
+/**
|
|
|
+ * @Author xiaobai
|
|
|
+ * @Date 2023/7/12 14:04
|
|
|
+ * @Title: PageContentHandler
|
|
|
+ * @Package com.xiaobai.util
|
|
|
+ * @description:
|
|
|
+ */
|
|
|
+public class ReadContentHandler extends ToXMLContentHandler {
|
|
|
+ private String pageTag = "div";
|
|
|
+ private String pageClass = "page";
|
|
|
+ private int pageNumber = 0;
|
|
|
+
|
|
|
+ private Map<Integer, StringBuilder> pageMap;
|
|
|
+
|
|
|
+ public ReadContentHandler() {
|
|
|
+ super();
|
|
|
+ pageMap = new HashMap<>();
|
|
|
+ }
|
|
|
+
|
|
|
+ private void startPage() {
|
|
|
+ pageNumber++;
|
|
|
+ pageMap.put(pageNumber, new StringBuilder());
|
|
|
+ }
|
|
|
+
|
|
|
+ private void endPage() {
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
|
|
|
+ if (pageTag.equals(qName) && pageClass.equals(atts.getValue("class"))) {
|
|
|
+ startPage();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void endElement(String uri, String localName, String qName) throws SAXException {
|
|
|
+ if (pageTag.equals(qName)) {
|
|
|
+ endPage();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void characters(char[] ch, int start, int length) throws SAXException {
|
|
|
+ if (length > 0 && pageNumber > 0) {
|
|
|
+ if (ch.length == 1 && ch[0] == '\n') {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ pageMap.get(pageNumber).append(ch);
|
|
|
+// pageMap.get(pageNumber).append('\n');
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 文件基本信息
|
|
|
+ *
|
|
|
+ * @param file
|
|
|
+ * @return
|
|
|
+ * @throws IOException
|
|
|
+ * @throws SAXException
|
|
|
+ * @throws TikaException
|
|
|
+ */
|
|
|
+ public static Metadata fileData(File file) throws IOException, SAXException, TikaException {
|
|
|
+ FileInputStream input = new FileInputStream(file);//可以写文件路径,pdf,word,html等
|
|
|
+ BodyContentHandler textHandler = new BodyContentHandler();//获取内容
|
|
|
+ Metadata matadata = new Metadata();//Metadata对象保存了作者,标题等元数据
|
|
|
+ AutoDetectParser parser = new AutoDetectParser();//当调用parser,AutoDetectParser会自动估计文档MIME类型,此处输入PDP文件,因此可以使用PDFParser
|
|
|
+ ParseContext context = new ParseContext();
|
|
|
+ parser.parse(input, textHandler, matadata, context);//执行解析过程
|
|
|
+ input.close();
|
|
|
+
|
|
|
+ return matadata;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 读取文件内容
|
|
|
+ *
|
|
|
+ * @param file 支持txt/word/excle/pdf等多种格式
|
|
|
+ * @return
|
|
|
+ * @throws TikaException
|
|
|
+ * @throws IOException
|
|
|
+ */
|
|
|
+ public static String parseText(File file) throws TikaException, IOException {
|
|
|
+ Tika tika = new Tika();
|
|
|
+ return tika.parseToString(file);
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 按页读取文件内容
|
|
|
+ *
|
|
|
+ * @param file 仅支持pdf
|
|
|
+ * @return
|
|
|
+ * @throws TikaException
|
|
|
+ * @throws IOException
|
|
|
+ */
|
|
|
+ public static JSONArray parsePageToPdf(File file) throws Exception {
|
|
|
+ JSONArray jsonArray = new JSONArray();
|
|
|
+ JSONObject jsonObject = null;
|
|
|
+
|
|
|
+ ReadContentHandler handler = new ReadContentHandler();
|
|
|
+ Metadata metadata = new Metadata();
|
|
|
+
|
|
|
+ FileInputStream inputstream = new FileInputStream(file);
|
|
|
+ ParseContext pcontext = new ParseContext();
|
|
|
+
|
|
|
+ //parsing the document using PDF parser
|
|
|
+ PDFParser pdfparser = new PDFParser();
|
|
|
+ pdfparser.parse(inputstream, handler, metadata, pcontext);
|
|
|
+
|
|
|
+ //getting the content of the document by pages.
|
|
|
+ for (Map.Entry<Integer, StringBuilder> entry : handler.pageMap.entrySet()) {
|
|
|
+ jsonObject = new JSONObject();
|
|
|
+ jsonObject.put("page", entry.getKey());
|
|
|
+ jsonObject.put("content", entry.getValue().toString());
|
|
|
+ jsonArray.add(jsonObject);
|
|
|
+ }
|
|
|
+
|
|
|
+ return jsonArray;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ public static void main(String[] args) throws Exception {
|
|
|
+ String path = "D:\\SYSTEM\\Desktop\\temp\\SpringBoot 整合 TrueLicense 实现 License 的授权与服务器许可1.pdf";
|
|
|
+ File f = new File(path);
|
|
|
+ JSONArray json = parsePageToPdf(f);
|
|
|
+
|
|
|
+ System.err.println(json);
|
|
|
+// String txt = parseText(f);
|
|
|
+// System.err.println(txt);
|
|
|
+ }
|
|
|
+
|
|
|
+}
|