| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269 |
- package com.doc.biz.service.impl;
- import com.doc.biz.domain.DocIndex;
- import com.doc.biz.domain.DocInfo;
- import com.doc.biz.domain.EsDocInfo;
- import com.doc.biz.service.*;
- import com.doc.biz.vo.DocumentVO;
- import com.doc.common.config.EsConfig;
- import com.doc.common.constant.Constants;
- import com.doc.common.utils.FileContentUtils;
- import com.doc.common.utils.StringUtils;
- import com.doc.common.utils.file.FileUtils;
- import com.doc.common.utils.file.PdfUtils;
- import lombok.extern.slf4j.Slf4j;
- import org.springframework.data.elasticsearch.NoSuchIndexException;
- import org.springframework.scheduling.annotation.Async;
- import org.springframework.stereotype.Service;
- import org.springframework.web.multipart.MultipartFile;
- import javax.annotation.Resource;
- import java.io.File;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.Optional;
- import java.util.function.Function;
- /**
- * ElasticSearch 索引接口
- *
- * @author wukai
- * @date 2023-08-15
- */
- @Service
- @Slf4j
- public class ElasticSearchServiceImpl implements IElasticSearchService {
- @Resource
- private IEsDocInfoService esDocInfoService;
- @Resource
- private EsConfig esConfig;
- @Resource
- private IMongoService mongoService;
- @Resource
- private IOcrService ocrService;
- @Resource
- private IDocInfoService docInfoService;
- @Resource
- private IDocIndexService indexService;
- /**
- * 文件内容入es库
- *
- * @param info 文档信息
- */
- @Override
- @Async("threadPoolTaskExecutor")
- public void save(DocInfo info) {
- String fileType = info.getFileType().toLowerCase();
- if (Constants.IMAGE_EXTENSION.contains(fileType)) {
- DocIndex di = new DocIndex(info);
- indexService.insertDocIndex(di);
- return;
- }
- Map<String, Function<byte[], String>> handlerMap = createHandlerMap(fileType);
- String content = getContent(handlerMap, fileType, info.getFileId());
- // if (Constants.PDF_EXTENSION.contains(fileType) && StringUtils.isEmpty(content)) {
- // //如果是PDF未解析成功,则另外进行解析工作
- // //TODO 暂时取消该功能
- //// DocIndex di = new DocIndex(info);
- //// indexService.insertDocIndex(di);
- // return;
- // }
- if (StringUtils.isNotEmpty(content)) {
- comboIndex(info.getSpaceId());
- try {
- EsDocInfo esDocInfo = new EsDocInfo(info.getDocId(), content);
- esDocInfoService.save(esDocInfo);
- } catch (Exception e) {
- log.error("解析文件错误???{}", e.getMessage());
- }
- }
- }
- /**
- * 文件内容入es库
- *
- * @param info 文档信息
- */
- @Override
- @Async("threadPoolTaskExecutor")
- public void taskSave(DocInfo info) {
- String fileType = info.getFileType().toLowerCase();
- String content = "";
- if (Constants.IMAGE_EXTENSION.contains(fileType)) {
- content = getContent(fileType, info.getFileId());
- } else {
- Map<String, Function<byte[], String>> handlerMap = createHandlerMap(fileType);
- content = getContent(handlerMap, fileType, info.getFileId());
- if (Constants.PDF_EXTENSION.contains(fileType) && StringUtils.isEmpty(content)) {
- //如果是PDF未解析成功,则另外进行解析工作
- content = getContent(fileType, info.getFileId());
- }
- }
- if (StringUtils.isNotEmpty(content)) {
- comboIndex(info.getSpaceId());
- EsDocInfo esDocInfo = new EsDocInfo(info.getDocId(), content);
- esDocInfoService.save(esDocInfo);
- }
- //执行完,删除索引任务
- indexService.deleteDocIndexByDocId(info.getDocId());
- }
- /**
- * 通过ID删除ES内容
- *
- * @param info 文档信息
- */
- @Override
- public void delete(DocInfo info) {
- comboIndex(info.getSpaceId());
- //删除ES记录
- try {
- esDocInfoService.deleteById(info.getDocId());
- } catch (NoSuchIndexException e) {
- //不用管,表示没这个索引
- }
- }
- /**
- * 通过ID删除ES内容
- *
- * @param docId id
- */
- @Override
- public void delete(Long docId) {
- DocInfo docInfo = docInfoService.selectDocInfoByDocId(docId);
- delete(docInfo);
- }
- /**
- * 根据docId 查询文本内容
- *
- * @param docId id
- * @return 文本
- */
- @Override
- public EsDocInfo getEsDocInfo(Long docId) {
- DocInfo docInfo = docInfoService.selectDocInfoByDocId(docId);
- comboIndex(docInfo.getSpaceId());
- Optional<EsDocInfo> optional = esDocInfoService.findById(docId);
- if (optional.isPresent()) {
- EsDocInfo info = optional.get();
- info.setDocInfo(docInfo);
- return info;
- }
- return null;
- }
- /**
- * 组装ES索引名
- *
- * @param spaceId 空间ID
- */
- private void comboIndex(Long spaceId) {
- String indexName = "docs_" + spaceId;
- esConfig.setIndexName(indexName);
- }
- /**
- * 创建文件访问类型MAP
- *
- * @param fileType 文件类型
- * @return map
- */
- private Map<String, Function<byte[], String>> createHandlerMap(String fileType) {
- Map<String, Function<byte[], String>> handlerMap = new HashMap<>(16);
- switch (fileType) {
- case ".docx":
- case ".doc":
- case ".wps":
- //word wps文档
- handlerMap.put(fileType, FileContentUtils::getContentDoc);
- break;
- case ".xls":
- case ".et":
- handlerMap.put(fileType, FileContentUtils::getContentXls);
- break;
- case ".xlsx":
- handlerMap.put(fileType, FileContentUtils::getContentXlsx);
- break;
- case ".ppt":
- case ".dps":
- handlerMap.put(fileType, FileContentUtils::getContentPpt);
- break;
- case ".pptx":
- handlerMap.put(fileType, FileContentUtils::getContentPptx);
- break;
- case ".pdf":
- handlerMap.put(fileType, FileContentUtils::getContentPdf);
- break;
- case ".txt":
- handlerMap.put(fileType, FileContentUtils::getContentTxt);
- break;
- default:
- break;
- }
- return handlerMap;
- }
- /**
- * 获取文件内容
- *
- * @param handlerMap map
- * @param fileType 文件类型
- * @param fileId 文件ID
- * @return 解析结果
- */
- private String getContent(Map<String, Function<byte[], String>> handlerMap, String fileType, String fileId) {
- Function<byte[], String> handler = handlerMap.get(fileType);
- if (handler != null) {
- DocumentVO vo = mongoService.downloadFile(fileId);
- if (vo != null) {
- return handler.apply(vo.getData());
- }
- }
- return "";
- }
- /**
- * 获取文件内容
- *
- * @param fileType 文件类型
- * @param fileId 文件ID
- * @return 解析结果
- */
- private String getContent(String fileType, String fileId) {
- if (Constants.IMAGE_EXTENSION.contains(fileType)) {
- try {
- return ocrService.recognition(fileId);
- } catch (Exception e) {
- log.error("解析图片错误:{}", e.getMessage());
- }
- }
- String pdf = ".pdf";
- if (pdf.equals(fileType)) {
- DocumentVO vo = mongoService.downloadFile(fileId);
- StringBuffer sb = new StringBuffer();
- //PDF图片解析
- List<File> list = PdfUtils.pdfToImage(vo.getData());
- list.forEach(file -> {
- MultipartFile f = FileUtils.getMultipartFile(file, file.getName());
- try {
- sb.append(ocrService.recognition(f));
- } catch (Exception e) {
- log.error("解析图片错误:{}", e.getMessage());
- }
- });
- return sb.toString();
- }
- return "";
- }
- }
|