ElasticSearchServiceImpl.java 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. package com.doc.biz.service.impl;
  2. import com.doc.biz.domain.DocIndex;
  3. import com.doc.biz.domain.DocInfo;
  4. import com.doc.biz.domain.EsDocInfo;
  5. import com.doc.biz.service.*;
  6. import com.doc.biz.vo.DocumentVO;
  7. import com.doc.common.config.EsConfig;
  8. import com.doc.common.constant.Constants;
  9. import com.doc.common.utils.FileContentUtils;
  10. import com.doc.common.utils.StringUtils;
  11. import com.doc.common.utils.file.FileUtils;
  12. import com.doc.common.utils.file.PdfUtils;
  13. import lombok.extern.slf4j.Slf4j;
  14. import org.springframework.data.elasticsearch.NoSuchIndexException;
  15. import org.springframework.scheduling.annotation.Async;
  16. import org.springframework.stereotype.Service;
  17. import org.springframework.web.multipart.MultipartFile;
  18. import javax.annotation.Resource;
  19. import java.io.File;
  20. import java.util.HashMap;
  21. import java.util.List;
  22. import java.util.Map;
  23. import java.util.Optional;
  24. import java.util.function.Function;
  25. /**
  26. * ElasticSearch 索引接口
  27. *
  28. * @author wukai
  29. * @date 2023-08-15
  30. */
  31. @Service
  32. @Slf4j
  33. public class ElasticSearchServiceImpl implements IElasticSearchService {
  34. @Resource
  35. private IEsDocInfoService esDocInfoService;
  36. @Resource
  37. private EsConfig esConfig;
  38. @Resource
  39. private IMongoService mongoService;
  40. @Resource
  41. private IOcrService ocrService;
  42. @Resource
  43. private IDocInfoService docInfoService;
  44. @Resource
  45. private IDocIndexService indexService;
  46. /**
  47. * 文件内容入es库
  48. *
  49. * @param info 文档信息
  50. */
  51. @Override
  52. @Async("threadPoolTaskExecutor")
  53. public void save(DocInfo info) {
  54. String fileType = info.getFileType().toLowerCase();
  55. if (Constants.IMAGE_EXTENSION.contains(fileType)) {
  56. DocIndex di = new DocIndex(info);
  57. indexService.insertDocIndex(di);
  58. return;
  59. }
  60. Map<String, Function<byte[], String>> handlerMap = createHandlerMap(fileType);
  61. String content = getContent(handlerMap, fileType, info.getFileId());
  62. // if (Constants.PDF_EXTENSION.contains(fileType) && StringUtils.isEmpty(content)) {
  63. // //如果是PDF未解析成功,则另外进行解析工作
  64. // //TODO 暂时取消该功能
  65. //// DocIndex di = new DocIndex(info);
  66. //// indexService.insertDocIndex(di);
  67. // return;
  68. // }
  69. if (StringUtils.isNotEmpty(content)) {
  70. comboIndex(info.getSpaceId());
  71. try {
  72. EsDocInfo esDocInfo = new EsDocInfo(info.getDocId(), content);
  73. esDocInfoService.save(esDocInfo);
  74. } catch (Exception e) {
  75. log.error("解析文件错误???{}", e.getMessage());
  76. }
  77. }
  78. }
  79. /**
  80. * 文件内容入es库
  81. *
  82. * @param info 文档信息
  83. */
  84. @Override
  85. @Async("threadPoolTaskExecutor")
  86. public void taskSave(DocInfo info) {
  87. String fileType = info.getFileType().toLowerCase();
  88. String content = "";
  89. if (Constants.IMAGE_EXTENSION.contains(fileType)) {
  90. content = getContent(fileType, info.getFileId());
  91. } else {
  92. Map<String, Function<byte[], String>> handlerMap = createHandlerMap(fileType);
  93. content = getContent(handlerMap, fileType, info.getFileId());
  94. if (Constants.PDF_EXTENSION.contains(fileType) && StringUtils.isEmpty(content)) {
  95. //如果是PDF未解析成功,则另外进行解析工作
  96. content = getContent(fileType, info.getFileId());
  97. }
  98. }
  99. if (StringUtils.isNotEmpty(content)) {
  100. comboIndex(info.getSpaceId());
  101. EsDocInfo esDocInfo = new EsDocInfo(info.getDocId(), content);
  102. esDocInfoService.save(esDocInfo);
  103. }
  104. //执行完,删除索引任务
  105. indexService.deleteDocIndexByDocId(info.getDocId());
  106. }
  107. /**
  108. * 通过ID删除ES内容
  109. *
  110. * @param info 文档信息
  111. */
  112. @Override
  113. public void delete(DocInfo info) {
  114. comboIndex(info.getSpaceId());
  115. //删除ES记录
  116. try {
  117. esDocInfoService.deleteById(info.getDocId());
  118. } catch (NoSuchIndexException e) {
  119. //不用管,表示没这个索引
  120. }
  121. }
  122. /**
  123. * 通过ID删除ES内容
  124. *
  125. * @param docId id
  126. */
  127. @Override
  128. public void delete(Long docId) {
  129. DocInfo docInfo = docInfoService.selectDocInfoByDocId(docId);
  130. delete(docInfo);
  131. }
  132. /**
  133. * 根据docId 查询文本内容
  134. *
  135. * @param docId id
  136. * @return 文本
  137. */
  138. @Override
  139. public EsDocInfo getEsDocInfo(Long docId) {
  140. DocInfo docInfo = docInfoService.selectDocInfoByDocId(docId);
  141. comboIndex(docInfo.getSpaceId());
  142. Optional<EsDocInfo> optional = esDocInfoService.findById(docId);
  143. if (optional.isPresent()) {
  144. EsDocInfo info = optional.get();
  145. info.setDocInfo(docInfo);
  146. return info;
  147. }
  148. return null;
  149. }
  150. /**
  151. * 组装ES索引名
  152. *
  153. * @param spaceId 空间ID
  154. */
  155. private void comboIndex(Long spaceId) {
  156. String indexName = "docs_" + spaceId;
  157. esConfig.setIndexName(indexName);
  158. }
  159. /**
  160. * 创建文件访问类型MAP
  161. *
  162. * @param fileType 文件类型
  163. * @return map
  164. */
  165. private Map<String, Function<byte[], String>> createHandlerMap(String fileType) {
  166. Map<String, Function<byte[], String>> handlerMap = new HashMap<>(16);
  167. switch (fileType) {
  168. case ".docx":
  169. case ".doc":
  170. case ".wps":
  171. //word wps文档
  172. handlerMap.put(fileType, FileContentUtils::getContentDoc);
  173. break;
  174. case ".xls":
  175. case ".et":
  176. handlerMap.put(fileType, FileContentUtils::getContentXls);
  177. break;
  178. case ".xlsx":
  179. handlerMap.put(fileType, FileContentUtils::getContentXlsx);
  180. break;
  181. case ".ppt":
  182. case ".dps":
  183. handlerMap.put(fileType, FileContentUtils::getContentPpt);
  184. break;
  185. case ".pptx":
  186. handlerMap.put(fileType, FileContentUtils::getContentPptx);
  187. break;
  188. case ".pdf":
  189. handlerMap.put(fileType, FileContentUtils::getContentPdf);
  190. break;
  191. case ".txt":
  192. handlerMap.put(fileType, FileContentUtils::getContentTxt);
  193. break;
  194. default:
  195. break;
  196. }
  197. return handlerMap;
  198. }
  199. /**
  200. * 获取文件内容
  201. *
  202. * @param handlerMap map
  203. * @param fileType 文件类型
  204. * @param fileId 文件ID
  205. * @return 解析结果
  206. */
  207. private String getContent(Map<String, Function<byte[], String>> handlerMap, String fileType, String fileId) {
  208. Function<byte[], String> handler = handlerMap.get(fileType);
  209. if (handler != null) {
  210. DocumentVO vo = mongoService.downloadFile(fileId);
  211. if (vo != null) {
  212. return handler.apply(vo.getData());
  213. }
  214. }
  215. return "";
  216. }
  217. /**
  218. * 获取文件内容
  219. *
  220. * @param fileType 文件类型
  221. * @param fileId 文件ID
  222. * @return 解析结果
  223. */
  224. private String getContent(String fileType, String fileId) {
  225. if (Constants.IMAGE_EXTENSION.contains(fileType)) {
  226. try {
  227. return ocrService.recognition(fileId);
  228. } catch (Exception e) {
  229. log.error("解析图片错误:{}", e.getMessage());
  230. }
  231. }
  232. String pdf = ".pdf";
  233. if (pdf.equals(fileType)) {
  234. DocumentVO vo = mongoService.downloadFile(fileId);
  235. StringBuffer sb = new StringBuffer();
  236. //PDF图片解析
  237. List<File> list = PdfUtils.pdfToImage(vo.getData());
  238. list.forEach(file -> {
  239. MultipartFile f = FileUtils.getMultipartFile(file, file.getName());
  240. try {
  241. sb.append(ocrService.recognition(f));
  242. } catch (Exception e) {
  243. log.error("解析图片错误:{}", e.getMessage());
  244. }
  245. });
  246. return sb.toString();
  247. }
  248. return "";
  249. }
  250. }