java处理office文档与pdf文件(二)

2013/08/22 问题一箩筐

该部分主要内容:文件上传,以及office文件和pdf的html处理,以及提取text

// 根据服务器的文件保存地址和原文件名创建目录文件全路径
		File file = this.getFile();
		String url = "";
		String tempFile = "";
		String fileFolder = "";	//上传文件路径
		String hz = "";
		String oldOrgFileId = null;
		Long oldId = knowledge.getZsk_zskID();
		if(null != oldId && 0 != oldId){
			oldOrgFileId = knowledge.getOrgFileId();
		}
		
		if(null != file){
			// 截取扩展名
			hz = fileFileName.substring(fileFileName.lastIndexOf("."),fileFileName.length());
			String zskCode = knowledge.getZsk_Code();
			fileFolder = createNewFile(this.savePath,zskCode);
			// 上传的文件在服务器中的全路径
			url = fileFolder + "\\" + fileFileName;
			//1、文件上传
			FileUtils.copyFile(file, new File(url));	
			
			//2、文件转化为html
			tempFile = createNewFile(this.tempPath,zskCode);
			String htmlStr = "";
			if(hz.equals(".pdf")){
				htmlStr = "<html><body>" +
				"<embed src='"+fileFileName+"' width='100%' height='100%'></embed>" +
				"</body></html>";
			}else{
				String dstHtml = tempFile+"\\"+zskCode+".html";
				//删除文件夹下所有文件及子文件夹
				FileUtil.deleteChildFile(new File(tempFile));
				
				changeDocToHtml(hz, url, dstHtml);
				htmlStr = FileUtil.htmlToStr(dstHtml);
			}
			knowledge.setContentHtml(htmlStr);
			Clob htmlColb=Hibernate.createClob(htmlStr);
			knowledge.setZsk_Description(htmlColb);
			
			//3、获取上传文件对应的文本内容
			String docContent = findDocContent(hz, url);
			knowledge.setContentText(docContent);
			Clob docContentClob=Hibernate.createClob(docContent);
			knowledge.setZsk_Text(docContentClob);
			
			String orgFileId = new GUID().toString();	//知识库原文件对应的标识
			knowledge.setOrgFileId(orgFileId);
			knowledge.setZsk_ContentType(1);
		}else{
			Clob htmlColb = Hibernate.createClob(htmlArea);
			Clob textClob = Hibernate.createClob(htmlArea.replaceAll("</?[^>]+>", ""));
			knowledge.setZsk_Description(htmlColb);
			knowledge.setContentHtml(htmlArea);
			knowledge.setZsk_Text(textClob);
			knowledge.setContentText(htmlArea);
			knowledge.setZsk_ContentType(2);
		}
		
		//添加时处理
		if(null == oldId || 0 ==  oldId){
			//to--do  需要在后期重新处理 当前用户
			if(null == knowledge.getZsk_Author() || "".equals(knowledge.getZsk_Author())){	//当前用户
				knowledge.setZsk_Author(SessionUtil.getTSysAgent().getCagentname());
			}
			knowledge.setZsk_RegisterTime(new Date());
		}
		//to---do 
		knowledge.setZsk_LastMender(1L);
		knowledge.setZsk_ModifyTime(new Date());
		
		KnowLedgeOtherContion ko = new KnowLedgeOtherContion();
		ko.setFileContentType(fileContentType);
		ko.setFileFileName(fileFileName);
		ko.setOldId(oldId);
		ko.setTempFile(tempFile);
		ko.setUrl(url);
		ko.setOldOrgFileId(oldOrgFileId);
		
		knowUploadServiceImp.saveOrUpdateKnowledge(knowledge,ko);

将office转化为html

/**
	 * 将word,excel,ppt,pdf转化为html
	 * @param hz
	 * @param url
	 * @param dstHtml
	 */
	private void changeDocToHtml(String hz, String url, String dstHtml) {
		if("pdf".equalsIgnoreCase(hz)){
			
		}else if(".xls".equalsIgnoreCase(hz) || ".xlsx".equalsIgnoreCase(hz)){
			DocToHtml.getInstance().ExceltoHtml(url,dstHtml);
		}else if(".doc".equalsIgnoreCase(hz) || ".docx".equalsIgnoreCase(hz)){
			DocToHtml.getInstance().WordtoHtml(url,dstHtml);
		}else if(".ppt".equalsIgnoreCase(hz) || ".pptx".equalsIgnoreCase(hz)){
			DocToHtml.getInstance().PPTtoHtml(url, dstHtml);
		}
	}

将word,wxcel,ppt另存为html的方法

public boolean WordtoHtml(String srcFile, String dstFile) {
		ComThread.InitSTA();
		ActiveXComponent activexcomponent = new ActiveXComponent("Word.Application");
		String s2 = srcFile;
		String s3 = dstFile;
		boolean flag = false;
		try {
			activexcomponent.setProperty("Visible", new Variant(false));
			Dispatch dispatch = activexcomponent.getProperty("Documents").toDispatch();
			Dispatch dispatch1 = Dispatch.invoke(dispatch, "Open", 1,
					new Object[] { s2, new Variant(false), new Variant(true) },
					new int[1]).toDispatch();
			Dispatch.invoke(dispatch1, "SaveAs", 1, new Object[] { s3,new Variant(8) }, new int[1]);
			Variant variant = new Variant(false);
			Dispatch.call(dispatch1, "Close", variant);
			flag = true;
		} catch (Exception exception) {
			log.error("word转化为html出错-->"+exception.getMessage());
		} finally {
			activexcomponent.invoke("Quit", new Variant[0]);
			ComThread.Release();
			ComThread.quitMainSTA();
		}
		return flag;
	}

	public boolean PPTtoHtml(String srcFile, String dstFile) {
		ComThread.InitSTA();
		ActiveXComponent activexcomponent = new ActiveXComponent( "PowerPoint.Application");
		boolean flag = false;
		try {
			Dispatch dispatch = activexcomponent.getProperty("Presentations")
					.toDispatch();
			Dispatch dispatch1 = Dispatch.call(dispatch, "Open", srcFile,
					new Variant(-1), new Variant(-1), new Variant(0))
					.toDispatch();
			Dispatch.call(dispatch1, "SaveAs", dstFile, new Variant(12));
//			Variant variant = new Variant(-1);
			Dispatch.call(dispatch1, "Close");
			flag = true;
		} catch (Exception exception) {
			log.error("ppt转化为html出错-->"+exception.getMessage());
		} finally {
			activexcomponent.invoke("Quit", new Variant[0]);
			ComThread.Release();
			ComThread.quitMainSTA();
		}
		return flag;
	}

	public boolean ExceltoHtml(String s, String s1) {
		 ComThread.InitSTA();
		 ActiveXComponent activexcomponent = new
		 ActiveXComponent("Excel.Application");
		 boolean flag = false;
		 try
		 {
			 activexcomponent.setProperty("Visible", new Variant(false));
			 Dispatch dispatch = activexcomponent.getProperty("Workbooks").toDispatch();
			 Dispatch dispatch1 = Dispatch.invoke(dispatch, "Open", 1, new Object[] {
					 s, new Variant(false), new Variant(true)}, new int[1]).toDispatch();
			 Dispatch.call(dispatch1, "SaveAs", s1, new Variant(44));
			 Variant variant = new Variant(false);
			 Dispatch.call(dispatch1, "Close", variant);
			 flag = true;
		 }catch(Exception exception){
			 log.error("excel转化为html出错-->"+exception.getMessage());
		 }finally{
			 activexcomponent.invoke("Quit", new Variant[0]);
			 ComThread.Release();
			 ComThread.quitMainSTA();
		 }
		 return flag;
	}

获取office文件以及pdf的文本内容

private String findDocContent(String hz, String url) {
		String docContent = null;
		File file = new File(url);
		if(".pdf".equalsIgnoreCase(hz)){
			docContent = GetDocText.getDocTextInta().getTextFromPdf(file);
		}else if(".xls".equalsIgnoreCase(hz) || ".xlsx".equalsIgnoreCase(hz)){
			docContent = GetDocText.getDocTextInta().getTextFromExcel(file);
		}else if(".doc".equalsIgnoreCase(hz) || ".docx".equalsIgnoreCase(hz)){
			docContent = GetDocText.getDocTextInta().getTextFromWord(file);
		}else if(".ppt".equalsIgnoreCase(hz) || ".pptx".equalsIgnoreCase(hz)){
			docContent = GetDocText.getDocTextInta().getTextFromPPT(file);
		}
		return docContent;
	}

具体的实现方法

/**
	 * 从word文件获取文本内容
	 * 
	 * @param wordFile
	 * @return word文件的文本内容
	 */
	public String getTextFromWord(File wordFile) {
		String wordText = "";
		InputStream is = null;
		try {  
            //word 2003: 图片不会被读取  
            is = new FileInputStream(wordFile);
            String fileName = wordFile.getName();
		    String hz = fileName.substring(fileName.lastIndexOf("."),fileName.length());
		    if(".doc".equals(hz)){
		    	WordExtractor ex = new WordExtractor(is);  
		    	wordText = ex.getText();
		    }else{
		    	OPCPackage opcPackage = POIXMLDocument.openPackage(wordFile.getAbsolutePath());
		    	POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);  
		    	wordText = extractor.getText();
		    }
              
        } catch (Exception e) {  
            e.printStackTrace();  
        }finally{
        	if(is != null){
        		try {
					is.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
        	}
        }
		return wordText;
	}

	/**
	 * 从excel获取文本内容
	 * 
	 * @param excelFile
	 * @return Excel文件的文本内容
	 */
	public String getTextFromExcel(File excelFile) {
		String text = "";
		InputStream in = null;
		try {
			//创建相关的文件流对象
			in = new FileInputStream(excelFile);
		    //声明相关的工作薄对象
			Workbook wb =null;
		    //声明相关的excel抽取对象
		    ExcelExtractor extractor=null;
		    String fileName = excelFile.getName();
		    String hz = fileName.substring(fileName.lastIndexOf("."),fileName.length());
		    
		    if(hz.equals(".xls"))//针对2003版本
		    {
		    	//创建excel2003的文件文本抽取对象
		    	wb=new HSSFWorkbook(new POIFSFileSystem(in));
		    	extractor =new org.apache.poi.hssf.extractor.ExcelExtractor((HSSFWorkbook)wb);
		    }else{ //针对2007版本
		    	wb = new  XSSFWorkbook(in);
		    	//创建excel2007的文件文本抽取对象
		    	extractor =new XSSFExcelExtractor((XSSFWorkbook)wb);
		    }
		    
		    extractor.setFormulasNotResults(false);
		    //是否抽象sheet页的名称
		    extractor.setIncludeSheetNames(true);
		    //是否抽取cell的注释内容
		    extractor.setIncludeCellComments(true);
		    //获取相关的抽取文本信息
		    text = extractor.getText();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}finally{
			if(in != null){
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		
		return text;
	}
	/**
	 * 从ppt获取文本内容
	 * 
	 * @param pptFile
	 * @return ppt文件的文本内容
	 */
	public String getTextFromPPT(File pptFile){
		String pptText = null;
		FileInputStream fin = null;
		try {
			fin = new FileInputStream(pptFile);
			String fileName = pptFile.getName();
			String hz = fileName.substring(fileName.lastIndexOf("."),fileName.length());
			if(".ppt".equals(hz)){
				QuickButCruddyTextExtractor qct = new QuickButCruddyTextExtractor(fin);
				pptText = qct.getTextAsString();
			}else{
				OPCPackage opcPackage = POIXMLDocument.openPackage(pptFile.getAbsolutePath());
				XSLFPowerPointExtractor pptExtractor = new XSLFPowerPointExtractor(opcPackage);
				pptText = pptExtractor.getText();
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (XmlException e) {
			e.printStackTrace();
		} catch (OpenXML4JException e) {
			e.printStackTrace();
		}finally{
			if(null != fin){
				try {
					fin.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return pptText;
	}
	/**
	 * 从pdf文件获取文本内容
	 * 
	 * @param pdfFile
	 * @return pdf文件的文本内容
	 */
	public String getTextFromPdf(File pdfFile){
		String result = null;
		FileInputStream is = null;
		PDDocument document = null;
		try{
			is = new FileInputStream(pdfFile);
			PDFParser parser = new PDFParser(is);
			parser.parse();
			document = parser.getPDDocument();
			PDFTextStripper stripper = new PDFTextStripper();
			result = stripper.getText(document);
		}catch(FileNotFoundException e){
			e.printStackTrace();
		}catch(IOException e){
			e.printStackTrace();
		}finally{
			if(is != null){
				try{
					is.close();
				}catch(IOException e){
					e.printStackTrace();
				}
			}
			if(document != null){
				try{
					document.close();
				}catch(IOException ex){
					ex.printStackTrace();
				}
			}
		}
		return result;
	}
	/**
	 * 
	 * @param txtFile
	 * @return  返回txt的内容
	 */
	public String getTextFromTxt(File txtFile){
		FileReader fr;
		StringBuffer buff = new StringBuffer();
		try {
			fr = new FileReader(txtFile);
			BufferedReader br = new BufferedReader(fr);
			String temp = null;
			while((temp = br.readLine()) != null){
				buff.append(temp + "\r\n"); 
			}
			br.close();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		
		return buff.toString();
	}

对带有clob字段的实体save时,直接调用hibernate的save即可。ojdbc14.jar。 更新时的处理  如下:

public void updateKnowledge(Knowledge knowledge) {
		 try {
		    knowledge.setZsk_Description(Hibernate.createClob(" "));
		    knowledge.setZsk_Text(Hibernate.createClob(" "));
		    update(knowledge);
		    flush();
		    
		    getSession().refresh(knowledge, LockMode.UPGRADE);
		    
		    SerializableClob htmlSc=(SerializableClob)knowledge.getZsk_Description();
		    SerializableClob textSc=(SerializableClob)knowledge.getZsk_Text();
		    Clob htmlWrapclob=htmlSc.getWrappedClob();
		    Clob textWrapclob=textSc.getWrappedClob();
		    CLOB htmlClob2=(CLOB)htmlWrapclob;
		    CLOB textClob2=(CLOB)textWrapclob;
		    Writer htmlWriter=htmlClob2.getCharacterOutputStream();
		    htmlWriter.write(knowledge.getContentHtml());
		    htmlWriter.close();
		    
		    Writer textWriter=textClob2.getCharacterOutputStream();
		    textWriter.write(knowledge.getContentText());
		    textWriter.close();
		    
		    update(knowledge);
		  } catch (RuntimeException re) {
		    throw re;
		  } catch (SQLException e) {
		    e.printStackTrace();
		  } catch (IOException e) {
		    e.printStackTrace();
		  }
	}

上面几步做完,基本可以完成上传以及存入数据库,以及对带有clob文件的更新。 需要的环境  windows,jacob-1.17-M2-x64 具体的jacob下载和配置 参照网络。poi-3.9

Search

    欢迎关注我的微信公众号

    Yannis的程序人生

    Table of Contents