package com.doc.biz.service.impl;

import com.doc.biz.domain.DocIndex;
import com.doc.biz.domain.DocInfo;
import com.doc.biz.domain.EsDocInfo;
import com.doc.biz.service.*;
import com.doc.biz.vo.DocumentVO;
import com.doc.common.config.EsConfig;
import com.doc.common.constant.Constants;
import com.doc.common.utils.FileContentUtils;
import com.doc.common.utils.StringUtils;
import com.doc.common.utils.file.FileUtils;
import com.doc.common.utils.file.PdfUtils;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.elasticsearch.NoSuchIndexException;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import javax.annotation.Resource;
import java.io.File;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;

/**
 * ElasticSearch indexing service: extracts document text and stores, deletes
 * or looks it up in the per-space index.
 *
 * @author wukai
 * @date 2023-08-15
 */
@Service
@Slf4j
public class ElasticSearchServiceImpl implements IElasticSearchService {

    /** File type suffix that is eligible for the page-image OCR fallback. */
    private static final String PDF_SUFFIX = ".pdf";

    @Resource
    private IEsDocInfoService esDocInfoService;

    @Resource
    private EsConfig esConfig;

    @Resource
    private IMongoService mongoService;

    @Resource
    private IOcrService ocrService;

    @Resource
    private IDocInfoService docInfoService;

    @Resource
    private IDocIndexService indexService;

    /**
     * Extracts the text of a document and stores it in ElasticSearch.
     *
     * <p>Image files are not parsed here: a {@link DocIndex} record is inserted
     * for them instead and the method returns. For every other type the matching
     * text handler is run and the result is saved only when it is non-empty.
     * Failures while saving are logged and not rethrown.
     *
     * @param info document metadata
     */
    @Override
    @Async("threadPoolTaskExecutor")
    public void save(DocInfo info) {
        // Locale.ROOT keeps the suffix comparison independent of the JVM default locale.
        String fileType = info.getFileType().toLowerCase(Locale.ROOT);
        if (Constants.IMAGE_EXTENSION.contains(fileType)) {
            indexService.insertDocIndex(new DocIndex(info));
            return;
        }

        String content = getContent(createHandlerMap(fileType), fileType, info.getFileId());
        // The fallback for PDFs whose text extraction yields nothing (inserting a
        // DocIndex record for them) is temporarily disabled.
        if (StringUtils.isNotEmpty(content)) {
            comboIndex(info.getSpaceId());
            try {
                esDocInfoService.save(new EsDocInfo(info.getDocId(), content));
            } catch (Exception e) {
                // Pass the throwable as well so the stack trace is not lost.
                log.error("解析文件错误???{}", e.getMessage(), e);
            }
        }
    }

    /**
     * Extracts the text of a document, using OCR where needed, stores it in
     * ElasticSearch and then removes the document's index task.
     *
     * <p>Image files go straight to OCR. Other files use the text handler for
     * their type; a PDF whose handler returns no text is retried through OCR.
     *
     * @param info document metadata
     */
    @Override
    @Async("threadPoolTaskExecutor")
    public void taskSave(DocInfo info) {
        String fileType = info.getFileType().toLowerCase(Locale.ROOT);
        String content;
        if (Constants.IMAGE_EXTENSION.contains(fileType)) {
            content = getContent(fileType, info.getFileId());
        } else {
            content = getContent(createHandlerMap(fileType), fileType, info.getFileId());
            if (Constants.PDF_EXTENSION.contains(fileType) && StringUtils.isEmpty(content)) {
                // The PDF produced no text, so fall back to OCR.
                content = getContent(fileType, info.getFileId());
            }
        }

        if (StringUtils.isNotEmpty(content)) {
            comboIndex(info.getSpaceId());
            esDocInfoService.save(new EsDocInfo(info.getDocId(), content));
        }
        // Done: remove the index task for this document.
        indexService.deleteDocIndexByDocId(info.getDocId());
    }

    /**
     * Deletes the ElasticSearch record of a document.
     *
     * @param info document metadata; its space id selects the index and its
     *             document id selects the record
     */
    @Override
    public void delete(DocInfo info) {
        comboIndex(info.getSpaceId());
        try {
            esDocInfoService.deleteById(info.getDocId());
        } catch (NoSuchIndexException ignored) {
            // The index does not exist, so there is nothing to delete.
        }
    }

    /**
     * Deletes the ElasticSearch record of the document with the given id.
     * Does nothing (apart from a warning) when no such document is found.
     *
     * @param docId document id
     */
    @Override
    public void delete(Long docId) {
        DocInfo docInfo = docInfoService.selectDocInfoByDocId(docId);
        if (docInfo == null) {
            // Without the document its space, and therefore its index, is unknown.
            log.warn("Document {} not found, skipping ElasticSearch delete", docId);
            return;
        }
        delete(docInfo);
    }

    /**
     * Looks up the indexed text of a document.
     *
     * @param docId document id
     * @return the indexed entry with its {@link DocInfo} attached, or
     *         {@code null} when the document or its indexed entry is not found
     */
    @Override
    public EsDocInfo getEsDocInfo(Long docId) {
        DocInfo docInfo = docInfoService.selectDocInfoByDocId(docId);
        if (docInfo == null) {
            return null;
        }
        comboIndex(docInfo.getSpaceId());
        Optional<EsDocInfo> found = esDocInfoService.findById(docId);
        found.ifPresent(esDocInfo -> esDocInfo.setDocInfo(docInfo));
        return found.orElse(null);
    }

    /**
     * Builds the index name for a space and sets it on {@code esConfig}.
     *
     * <p>NOTE(review): {@code esConfig} is an injected bean that the
     * {@code @Async} methods also write to. If it is shared between threads,
     * concurrent calls for different spaces could overwrite each other's index
     * name before the ElasticSearch call runs. Confirm how EsConfig scopes
     * the index name.
     *
     * @param spaceId space id
     */
    private void comboIndex(Long spaceId) {
        esConfig.setIndexName("docs_" + spaceId);
    }

    /**
     * Builds the text-extraction handler lookup for a file type.
     *
     * <p>Each handler receives the downloaded document and passes its data to
     * the matching {@link FileContentUtils} parser.
     *
     * @param fileType lower-cased file suffix including the dot, e.g. {@code ".pdf"}
     * @return a map holding the handler under {@code fileType}, or an empty map
     *         when the type has no text parser
     */
    private Map<String, Function<DocumentVO, String>> createHandlerMap(String fileType) {
        Map<String, Function<DocumentVO, String>> handlerMap = new HashMap<>(16);
        switch (fileType) {
            case ".docx":
            case ".doc":
            case ".wps":
                // Word and WPS text documents
                handlerMap.put(fileType, vo -> FileContentUtils.getContentDoc(vo.getData()));
                break;
            case ".xls":
            case ".et":
                handlerMap.put(fileType, vo -> FileContentUtils.getContentXls(vo.getData()));
                break;
            case ".xlsx":
                handlerMap.put(fileType, vo -> FileContentUtils.getContentXlsx(vo.getData()));
                break;
            case ".ppt":
            case ".dps":
                handlerMap.put(fileType, vo -> FileContentUtils.getContentPpt(vo.getData()));
                break;
            case ".pptx":
                handlerMap.put(fileType, vo -> FileContentUtils.getContentPptx(vo.getData()));
                break;
            case ".pdf":
                handlerMap.put(fileType, vo -> FileContentUtils.getContentPdf(vo.getData()));
                break;
            case ".txt":
                handlerMap.put(fileType, vo -> FileContentUtils.getContentTxt(vo.getData()));
                break;
            default:
                break;
        }
        return handlerMap;
    }

    /**
     * Extracts text with the handler registered for the file type.
     *
     * <p>The file is downloaded only when a handler exists for the type.
     *
     * @param handlerMap handlers keyed by file type
     * @param fileType   lower-cased file suffix
     * @param fileId     id of the stored file
     * @return the handler's result, or an empty string when there is no handler
     *         or the file cannot be downloaded
     */
    private String getContent(Map<String, Function<DocumentVO, String>> handlerMap,
                              String fileType, String fileId) {
        Function<DocumentVO, String> handler = handlerMap.get(fileType);
        if (handler == null) {
            return "";
        }
        DocumentVO vo = mongoService.downloadFile(fileId);
        if (vo == null) {
            return "";
        }
        return handler.apply(vo);
    }

    /**
     * Extracts text through OCR.
     *
     * <p>Image files are recognised directly by file id. PDFs are converted to
     * page images and each page is recognised separately; a page that fails is
     * logged and skipped.
     *
     * @param fileType lower-cased file suffix
     * @param fileId   id of the stored file
     * @return the recognised text, or an empty string when nothing was recognised
     */
    private String getContent(String fileType, String fileId) {
        if (Constants.IMAGE_EXTENSION.contains(fileType)) {
            try {
                return ocrService.recognition(fileId);
            } catch (Exception e) {
                log.error("解析图片错误:{}", e.getMessage(), e);
            }
        }

        if (!PDF_SUFFIX.equals(fileType)) {
            return "";
        }

        DocumentVO vo = mongoService.downloadFile(fileId);
        if (vo == null) {
            log.warn("File {} could not be downloaded, skipping PDF OCR", fileId);
            return "";
        }

        // Convert the PDF to page images and run OCR on each one.
        List<File> pages = PdfUtils.pdfToImage(vo.getData());
        if (pages == null) {
            return "";
        }
        StringBuilder text = new StringBuilder();
        for (File page : pages) {
            MultipartFile image = FileUtils.getMultipartFile(page, page.getName());
            try {
                text.append(ocrService.recognition(image));
            } catch (Exception e) {
                log.error("解析图片错误:{}", e.getMessage(), e);
            }
        }
        return text.toString();
    }
}