// houwenfeng/cloudfile
							package com.doc.biz.service.impl;

import com.doc.biz.domain.DocIndex;
import com.doc.biz.domain.DocInfo;
import com.doc.biz.domain.EsDocInfo;
import com.doc.biz.service.*;
import com.doc.biz.vo.DocumentVO;
import com.doc.common.config.EsConfig;
import com.doc.common.constant.Constants;
import com.doc.common.utils.FileContentUtils;
import com.doc.common.utils.StringUtils;
import com.doc.common.utils.file.FileUtils;
import com.doc.common.utils.file.PdfUtils;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.elasticsearch.NoSuchIndexException;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import javax.annotation.Resource;
import java.io.File;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;

/**
 * ElasticSearch 索引接口
 *
 * @author wukai
 * @date 2023-08-15
 */
@Service
@Slf4j
public class ElasticSearchServiceImpl implements IElasticSearchService {
    @Resource
    private IEsDocInfoService esDocInfoService;
    @Resource
    private EsConfig esConfig;
    @Resource
    private IMongoService mongoService;
    @Resource
    private IOcrService ocrService;
    @Resource
    private IDocInfoService docInfoService;
    @Resource
    private IDocIndexService indexService;

    /**
     * 文件内容入es库
     *
     * @param info 文档信息
     */
    @Override
    @Async("threadPoolTaskExecutor")
    public void save(DocInfo info) {
        String fileType = info.getFileType().toLowerCase();

        if (Constants.IMAGE_EXTENSION.contains(fileType)) {
            DocIndex di = new DocIndex(info);
            indexService.insertDocIndex(di);
            return;
        }

        Map<String, Function<byte[], String>> handlerMap = createHandlerMap(fileType);
        String content = getContent(handlerMap, fileType, info.getFileId());

//        if (Constants.PDF_EXTENSION.contains(fileType) && StringUtils.isEmpty(content)) {
//            //如果是PDF未解析成功，则另外进行解析工作
//            //TODO 暂时取消该功能
////            DocIndex di = new DocIndex(info);
////            indexService.insertDocIndex(di);
//            return;
//        }

        if (StringUtils.isNotEmpty(content)) {
            comboIndex(info.getSpaceId());
            try {
                EsDocInfo esDocInfo = new EsDocInfo(info.getDocId(), content);
                esDocInfoService.save(esDocInfo);
            } catch (Exception e) {
                log.error("解析文件错误？？？{}", e.getMessage());
            }
        }
    }

    /**
     * 文件内容入es库
     *
     * @param info 文档信息
     */
    @Override
    @Async("threadPoolTaskExecutor")
    public void taskSave(DocInfo info) {
        String fileType = info.getFileType().toLowerCase();
        String content = "";
        if (Constants.IMAGE_EXTENSION.contains(fileType)) {
            content = getContent(fileType, info.getFileId());
        } else {
            Map<String, Function<byte[], String>> handlerMap = createHandlerMap(fileType);
            content = getContent(handlerMap, fileType, info.getFileId());
            if (Constants.PDF_EXTENSION.contains(fileType) && StringUtils.isEmpty(content)) {
                //如果是PDF未解析成功，则另外进行解析工作
                content = getContent(fileType, info.getFileId());
            }

        }

        if (StringUtils.isNotEmpty(content)) {
            comboIndex(info.getSpaceId());
            EsDocInfo esDocInfo = new EsDocInfo(info.getDocId(), content);
            esDocInfoService.save(esDocInfo);
        }
        //执行完，删除索引任务
        indexService.deleteDocIndexByDocId(info.getDocId());
    }

    /**
     * 通过ID删除ES内容
     *
     * @param info 文档信息
     */
    @Override
    public void delete(DocInfo info) {
        comboIndex(info.getSpaceId());
        //删除ES记录
        try {
            esDocInfoService.deleteById(info.getDocId());
        } catch (NoSuchIndexException e) {
            //不用管，表示没这个索引
        }
    }

    /**
     * 通过ID删除ES内容
     *
     * @param docId id
     */
    @Override
    public void delete(Long docId) {
        DocInfo docInfo = docInfoService.selectDocInfoByDocId(docId);
        delete(docInfo);
    }

    /**
     * 根据docId 查询文本内容
     *
     * @param docId id
     * @return 文本
     */
    @Override
    public EsDocInfo getEsDocInfo(Long docId) {
        DocInfo docInfo = docInfoService.selectDocInfoByDocId(docId);
        comboIndex(docInfo.getSpaceId());
        Optional<EsDocInfo> optional = esDocInfoService.findById(docId);
        if (optional.isPresent()) {
            EsDocInfo info = optional.get();
            info.setDocInfo(docInfo);
            return info;
        }
        return null;
    }

    /**
     * 组装ES索引名
     *
     * @param spaceId 空间ID
     */
    private void comboIndex(Long spaceId) {
        String indexName = "docs_" + spaceId;
        esConfig.setIndexName(indexName);
    }

    /**
     * 创建文件访问类型MAP
     *
     * @param fileType 文件类型
     * @return map
     */
    private Map<String, Function<byte[], String>> createHandlerMap(String fileType) {
        Map<String, Function<byte[], String>> handlerMap = new HashMap<>(16);
        switch (fileType) {
            case ".docx":
            case ".doc":
            case ".wps":
                //word wps文档
                handlerMap.put(fileType, FileContentUtils::getContentDoc);
                break;
            case ".xls":
            case ".et":
                handlerMap.put(fileType, FileContentUtils::getContentXls);
                break;
            case ".xlsx":
                handlerMap.put(fileType, FileContentUtils::getContentXlsx);
                break;
            case ".ppt":
            case ".dps":
                handlerMap.put(fileType, FileContentUtils::getContentPpt);
                break;
            case ".pptx":
                handlerMap.put(fileType, FileContentUtils::getContentPptx);
                break;
            case ".pdf":
                handlerMap.put(fileType, FileContentUtils::getContentPdf);
                break;
            case ".txt":
                handlerMap.put(fileType, FileContentUtils::getContentTxt);
                break;
            default:
                break;
        }
        return handlerMap;
    }

    /**
     * 获取文件内容
     *
     * @param handlerMap map
     * @param fileType   文件类型
     * @param fileId     文件ID
     * @return 解析结果
     */
    private String getContent(Map<String, Function<byte[], String>> handlerMap, String fileType, String fileId) {
        Function<byte[], String> handler = handlerMap.get(fileType);
        if (handler != null) {
            DocumentVO vo = mongoService.downloadFile(fileId);
            if (vo != null) {
                return handler.apply(vo.getData());
            }
        }
        return "";
    }

    /**
     * 获取文件内容
     *
     * @param fileType 文件类型
     * @param fileId   文件ID
     * @return 解析结果
     */
    private String getContent(String fileType, String fileId) {
        if (Constants.IMAGE_EXTENSION.contains(fileType)) {
            try {
                return ocrService.recognition(fileId);
            } catch (Exception e) {
                log.error("解析图片错误：{}", e.getMessage());
            }
        }
        String pdf = ".pdf";
        if (pdf.equals(fileType)) {
            DocumentVO vo = mongoService.downloadFile(fileId);
            StringBuffer sb = new StringBuffer();
            //PDF图片解析
            List<File> list = PdfUtils.pdfToImage(vo.getData());
            list.forEach(file -> {
                MultipartFile f = FileUtils.getMultipartFile(file, file.getName());
                try {
                    sb.append(ocrService.recognition(f));
                } catch (Exception e) {
                    log.error("解析图片错误：{}", e.getMessage());
                }
            });
            return sb.toString();
        }
        return "";
    }
}