|
|
@@ -2,17 +2,17 @@ package com.doc.common.utils;
|
|
|
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
-import org.apache.poi.hwpf.HWPFDocument;
|
|
|
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
|
|
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
|
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
|
|
+import org.apache.poi.sl.extractor.SlideShowExtractor;
|
|
|
import org.apache.poi.ss.usermodel.Row;
|
|
|
+import org.apache.poi.ss.usermodel.Sheet;
|
|
|
+import org.apache.poi.ss.usermodel.Workbook;
|
|
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
|
|
-import org.apache.poi.xslf.usermodel.XSLFShape;
|
|
|
-import org.apache.poi.xslf.usermodel.XSLFSlide;
|
|
|
-import org.apache.poi.xslf.usermodel.XSLFTextShape;
|
|
|
-import org.apache.poi.xssf.usermodel.XSSFSheet;
|
|
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|
|
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
|
-import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
|
|
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
import java.io.IOException;
|
|
|
@@ -20,7 +20,6 @@ import java.io.InputStream;
|
|
|
import java.nio.charset.StandardCharsets;
|
|
|
import java.nio.file.Files;
|
|
|
import java.nio.file.Paths;
|
|
|
-import java.util.List;
|
|
|
|
|
|
/**
|
|
|
* 读取文件内容
|
|
|
@@ -42,18 +41,8 @@ public class FileContentUtils {
|
|
|
//根据需求入参也可以改为文件路径,对应的输入流部分改为new File(路径)即可
|
|
|
// 2007版本的word
|
|
|
XWPFDocument xwpf = new XWPFDocument(is);
|
|
|
- // 2007版本,仅支持docx文件处理
|
|
|
- List<XWPFParagraph> paragraphs = xwpf.getParagraphs();
|
|
|
- StringBuilder content = new StringBuilder();
|
|
|
-
|
|
|
- for (XWPFParagraph paragraph : paragraphs) {
|
|
|
- if (!paragraph.getParagraphText().startsWith(" ")) {
|
|
|
- content.append(paragraph.getParagraphText().trim()).append("\r\n");
|
|
|
- } else {
|
|
|
- content.append(paragraph.getParagraphText());
|
|
|
- }
|
|
|
- }
|
|
|
- return content.toString();
|
|
|
+ XWPFWordExtractor extractor = new XWPFWordExtractor(xwpf);
|
|
|
+ return extractor.getText();
|
|
|
} catch (Exception e) {
|
|
|
e.printStackTrace();
|
|
|
return "";
|
|
|
@@ -70,19 +59,7 @@ public class FileContentUtils {
|
|
|
try (InputStream is = new ByteArrayInputStream(data)) {
|
|
|
// 2003版本的word
|
|
|
WordExtractor wordExtractor = new WordExtractor(is);
|
|
|
- // 2003版本 仅doc格式文件可处理,docx文件不可处理
|
|
|
- String[] text = wordExtractor.getParagraphText();
|
|
|
- StringBuilder content = new StringBuilder();
|
|
|
-
|
|
|
- // 获取段落,段落缩进无法获取,可以在前添加空格填充
|
|
|
- for (String str : text) {
|
|
|
- if (!str.startsWith(" ")) {
|
|
|
- content.append(str.trim()).append("\r\n");
|
|
|
- } else {
|
|
|
- content.append(str);
|
|
|
- }
|
|
|
- }
|
|
|
- return content.toString();
|
|
|
+ return wordExtractor.getText();
|
|
|
} catch (IllegalArgumentException e) {
|
|
|
//做兼容,因为onlyoffice在线编辑时,会将2003的文档保存为word2007的格式。
|
|
|
return getContentDocx(data);
|
|
|
@@ -93,37 +70,6 @@ public class FileContentUtils {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * 获取正文文件内容,wps方法
|
|
|
- *
|
|
|
- * @param data 二进制文件内容
|
|
|
- * @return
|
|
|
- */
|
|
|
- public static String getContentWps(byte[] data) {
|
|
|
- // 0表示获取正常,1表示获取异常
|
|
|
- try (InputStream is = new ByteArrayInputStream(data)) {
|
|
|
- // wps版本word
|
|
|
- HWPFDocument hwpf = new HWPFDocument(is);
|
|
|
-
|
|
|
- // 文档文本内容
|
|
|
- String[] text = new WordExtractor(hwpf).getParagraphText();
|
|
|
- StringBuilder content = new StringBuilder();
|
|
|
- if (text != null && text.length > 0) {
|
|
|
- for (String str : text) {
|
|
|
- if (!str.startsWith(" ")) {
|
|
|
- content.append(str.trim()).append("\r\n");
|
|
|
- } else {
|
|
|
- content.append(str);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- return content.toString();
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
- return "";
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
* 获取正文文件内容,PDF方法
|
|
|
*
|
|
|
* @param data 二进制文件内容
|
|
|
@@ -150,25 +96,17 @@ public class FileContentUtils {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * 获取正文文件内容,PDF方法
|
|
|
+ * 获取正文文件内容,PPTX方法
|
|
|
*
|
|
|
* @param data 二进制文件内容
|
|
|
* @return
|
|
|
*/
|
|
|
- public static String getContentPpt(byte[] data) {
|
|
|
+ public static String getContentPptx(byte[] data) {
|
|
|
String result = "";
|
|
|
try (InputStream is = new ByteArrayInputStream(data)) {
|
|
|
XMLSlideShow ppt = new XMLSlideShow(is);
|
|
|
- List<XSLFSlide> slideList = ppt.getSlides();
|
|
|
- for (XSLFSlide slide : slideList) {
|
|
|
- List<XSLFShape> shapeList = slide.getShapes();
|
|
|
- for (XSLFShape shape : shapeList) {
|
|
|
- if (shape instanceof XSLFTextShape) {
|
|
|
- XSLFTextShape textSharp = (XSLFTextShape) shape;
|
|
|
- result += textSharp.getText() + "\n";
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
+ SlideShowExtractor e = new SlideShowExtractor(ppt);
|
|
|
+ return e.getText();
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
}
|
|
|
@@ -176,41 +114,84 @@ public class FileContentUtils {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * 获取正文文件内容,PDF方法
|
|
|
+ * 获取正文文件内容,PPT方法
|
|
|
*
|
|
|
* @param data 二进制文件内容
|
|
|
* @return
|
|
|
*/
|
|
|
- public static String getContentExcel(byte[] data) {
|
|
|
- String result = "";
|
|
|
+ public static String getContentPpt(byte[] data) {
|
|
|
+ try (InputStream is = new ByteArrayInputStream(data)) {
|
|
|
+ HSLFSlideShow slideShow = new HSLFSlideShow(is);
|
|
|
+ SlideShowExtractor e = new SlideShowExtractor(slideShow);
|
|
|
+ return e.getText();
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 获取正文文件内容,XLSX方法
|
|
|
+ *
|
|
|
+ * @param data 二进制文件内容
|
|
|
+ * @return 结果
|
|
|
+ */
|
|
|
+ public static String getContentXlsx(byte[] data) {
|
|
|
try (InputStream is = new ByteArrayInputStream(data);
|
|
|
XSSFWorkbook xssfWorkbook = new XSSFWorkbook(is)) {
|
|
|
//创建工作簿对象
|
|
|
- //获取工作簿下sheet的个数
|
|
|
- int sheetNum = xssfWorkbook.getNumberOfSheets();
|
|
|
- //遍历工作簿中的所有数据
|
|
|
- for (int i = 0; i < sheetNum; i++) {
|
|
|
- //读取第i个工作表
|
|
|
- XSSFSheet sheet = xssfWorkbook.getSheetAt(i);
|
|
|
- //获取最后一行的num,即总行数。此处从0开始
|
|
|
- int maxRow = sheet.getLastRowNum();
|
|
|
- for (int rowNum = 0; rowNum <= maxRow; rowNum++) {
|
|
|
- Row row = sheet.getRow(rowNum);
|
|
|
- if (row != null) {
|
|
|
- //获取最后单元格num,即总单元格数 ***注意:此处从1开始计数***
|
|
|
- int maxRol = row.getLastCellNum();
|
|
|
- for (int cellNum = 0; cellNum < maxRol; cellNum++) {
|
|
|
- result += row.getCell(cellNum) + " ";
|
|
|
- }
|
|
|
- result += "\n";
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
+ return getSheetText(xssfWorkbook);
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
+ /**
|
|
|
+ * 获取正文文件内容,xls
|
|
|
+ *
|
|
|
+ * @param data 二进制文件内容
|
|
|
+ * @return
|
|
|
+ */
|
|
|
+ public static String getContentXls(byte[] data) {
|
|
|
+ try (InputStream is = new ByteArrayInputStream(data);
|
|
|
+ HSSFWorkbook hssfWorkbook = new HSSFWorkbook(is)) {
|
|
|
+ //创建工作簿对象
|
|
|
+ return getSheetText(hssfWorkbook);
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
+ return "";
|
|
|
}
|
|
|
+ }
|
|
|
|
|
|
+ /**
|
|
|
+ * 获取工作表内容
|
|
|
+ *
|
|
|
+ * @param workbook 工作簿
|
|
|
+ * @return txt
|
|
|
+ */
|
|
|
+ private static String getSheetText(Workbook workbook) {
|
|
|
+ String result = "";
|
|
|
+ //获取工作簿下sheet的个数
|
|
|
+ int sheetNum = workbook.getNumberOfSheets();
|
|
|
+ //遍历工作簿中的所有数据
|
|
|
+ for (int i = 0; i < sheetNum; i++) {
|
|
|
+ //读取第i个工作表
|
|
|
+ Sheet sheet = workbook.getSheetAt(i);
|
|
|
+ //获取最后一行的行号(下标从0开始,并非总行数),因此下面的循环使用 <=
|
|
|
+ int maxRow = sheet.getLastRowNum();
|
|
|
+ for (int rowNum = 0; rowNum <= maxRow; rowNum++) {
|
|
|
+ Row row = sheet.getRow(rowNum);
|
|
|
+ if (row != null) {
|
|
|
+ //获取最后单元格num,即总单元格数 ***注意:此处从1开始计数***
|
|
|
+ int maxRol = row.getLastCellNum();
|
|
|
+ for (int cellNum = 0; cellNum < maxRol; cellNum++) {
|
|
|
+ result += row.getCell(cellNum) + " ";
|
|
|
+ }
|
|
|
+ result += "\n";
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
return result.replace("\n\n", "\n");
|
|
|
}
|
|
|
|
|
|
@@ -230,11 +211,10 @@ public class FileContentUtils {
|
|
|
}
|
|
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
|
- String path = "D:\\SYSTEM\\Desktop\\temp\\parse\\test.xlsx";
|
|
|
-
|
|
|
- byte[] bytes = Files.readAllBytes(Paths.get(path));
|
|
|
+ String path = "D:\\SYSTEM\\Desktop\\temp\\parse\\test.docx";
|
|
|
|
|
|
- String result = getContentExcel(bytes);
|
|
|
+ byte[] data = Files.readAllBytes(Paths.get(path));
|
|
|
+ String result = getContentDoc(data);
|
|
|
System.err.println(result);
|
|
|
}
|
|
|
}
|