// OdfEditableTextExtractor.java
/**
* **********************************************************************
*
* <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* <p>http://www.apache.org/licenses/LICENSE-2.0
*
* <p>Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
* <p>**********************************************************************
*/
package org.odftoolkit.odfdom.incubator.doc.text;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.odftoolkit.odfdom.doc.OdfDocument;
import org.odftoolkit.odfdom.doc.table.OdfTable;
import org.odftoolkit.odfdom.doc.table.OdfTableRow;
import org.odftoolkit.odfdom.dom.OdfContentDom;
import org.odftoolkit.odfdom.dom.OdfMetaDom;
import org.odftoolkit.odfdom.dom.OdfStylesDom;
import org.odftoolkit.odfdom.dom.element.draw.DrawObjectElement;
import org.odftoolkit.odfdom.dom.element.office.OfficeMetaElement;
import org.odftoolkit.odfdom.dom.element.style.StyleMasterPageElement;
import org.odftoolkit.odfdom.dom.element.table.TableTableElement;
import org.odftoolkit.odfdom.dom.element.text.TextAElement;
import org.odftoolkit.odfdom.dom.element.text.TextTrackedChangesElement;
import org.odftoolkit.odfdom.pkg.OdfElement;
import org.w3c.dom.NodeList;
/**
* A subclass of OdfTextExtractor. It provides a method to return all the text that the user can
* typically edit in a document, including text in content.xml, headers and footers in styles.xml,
* and metadata in meta.xml.
*
* <p>This class can be used by search engines and text analytics operations.
*/
public class OdfEditableTextExtractor extends OdfTextExtractor {

  /** Shared logger; extraction failures are logged instead of being propagated. */
  private static final Logger LOGGER = Logger.getLogger(OdfEditableTextExtractor.class.getName());

  /** The document to extract from; only set when created through the document factory. */
  OdfDocument mDocument = null;

  /** The element to extract from; only set when created through the element factory. */
  OdfElement mElement = null;

  /** {@code true} when this extractor works on {@link #mDocument} rather than {@link #mElement}. */
  boolean mIsDocumentExtractor = false;

  /**
   * Constructor with an ODF document as a parameter
   *
   * @param doc the ODF document whose editable text would be extracted.
   */
  private OdfEditableTextExtractor(OdfDocument doc) {
    mTextBuilder = new StringBuilder();
    mDocument = doc;
    mIsDocumentExtractor = true;
  }

  /**
   * Constructor with an ODF element as parameter
   *
   * @param element the ODF element whose editable text would be extracted.
   */
  private OdfEditableTextExtractor(OdfElement element) {
    mTextBuilder = new StringBuilder();
    mElement = element;
    mIsDocumentExtractor = false;
  }

  /**
   * An instance of OdfEditableTextExtractor will be created to extract the editable text content of
   * an ODF document.
   *
   * @param doc the ODF document whose text will be extracted.
   * @return An instance of OdfEditableTextExtractor
   */
  public static OdfEditableTextExtractor newOdfEditableTextExtractor(OdfDocument doc) {
    return new OdfEditableTextExtractor(doc);
  }

  /**
   * An instance of OdfEditableTextExtractor will be created to extract the editable text content of
   * an ODF element.
   *
   * @param element the ODF element whose text will be extracted.
   * @return An instance of OdfEditableTextExtractor
   */
  public static OdfEditableTextExtractor newOdfEditableTextExtractor(OdfElement element) {
    return new OdfEditableTextExtractor(element);
  }

  /**
   * Appends the editable text of the sub-document referenced by a {@code draw:object}.
   *
   * <p>The object is skipped when it carries no {@code xlink:href} or when its owner DOM is not an
   * {@link OdfContentDom} (the sub-document is resolved through the content DOM's document, so an
   * unconditional cast would fail for any other owner). Failures while extracting the embedded
   * document are logged and do not abort the surrounding extraction.
   *
   * @param element the embedded-object element being visited
   * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.draw.DrawObjectElement)
   */
  @Override
  public void visit(DrawObjectElement element) {
    String embedDocPath = element.getXlinkHrefAttribute();
    if (embedDocPath == null) {
      // No link target, hence no sub-document to load.
      return;
    }
    if (!(element.getOwnerDocument() instanceof OdfContentDom)) {
      // Guard the cast below instead of letting a ClassCastException escape.
      return;
    }
    OdfContentDom ownerDom = (OdfContentDom) element.getOwnerDocument();
    OdfDocument embedDoc = ((OdfDocument) ownerDom.getDocument()).loadSubDocument(embedDocPath);
    if (embedDoc != null) {
      try {
        mTextBuilder.append(
            OdfEditableTextExtractor.newOdfEditableTextExtractor(embedDoc).getText());
      } catch (Exception e) {
        LOGGER.log(Level.SEVERE, null, e);
      }
    }
  }

  /**
   * Ignores tracked changes: nothing below a {@code text:tracked-changes} element is appended.
   *
   * @param ele the tracked-changes element being visited
   * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextTrackedChangesElement)
   */
  @Override
  public void visit(TextTrackedChangesElement ele) {
    // Intentionally empty.
  }

  /**
   * Appends the link target of a hyperlink followed by the text of the element.
   *
   * @param ele the hyperlink element being visited
   * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextAElement)
   */
  @Override
  public void visit(TextAElement ele) {
    String link = ele.getXlinkHrefAttribute();
    // StringBuilder.append((String) null) would write the four characters "null".
    if (link != null) {
      mTextBuilder.append(link);
    }
    appendElementText(ele);
  }

  /**
   * Appends the display text of every cell of a table: each cell is followed by {@code TabChar}
   * and each row by {@code NewLineChar}.
   *
   * @param ele the table element being visited
   * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.table.TableTableElement)
   */
  @Override
  public void visit(TableTableElement ele) {
    OdfTable table = OdfTable.getInstance(ele);
    List<OdfTableRow> rows = table.getRowList();
    for (OdfTableRow row : rows) {
      // The cell count is re-read on every iteration, exactly as an index loop over the row would.
      for (int cellIndex = 0; cellIndex < row.getCellCount(); cellIndex++) {
        mTextBuilder.append(row.getCellByIndex(cellIndex).getDisplayText()).append(TabChar);
      }
      mTextBuilder.append(NewLineChar);
    }
  }

  /**
   * Return the editable text content as a string
   *
   * <p>For a document extractor the text is collected from content, styles and meta data. For an
   * element extractor the element is visited and the accumulated builder content is returned.
   * NOTE(review): in element mode every call visits the element again on the same builder, so a
   * second call presumably returns the text twice - confirm against OdfTextExtractor.
   *
   * @return the editable text content as a string
   */
  @Override
  public String getText() {
    if (mIsDocumentExtractor) {
      return getDocumentText();
    }
    visit(mElement);
    return mTextBuilder.toString();
  }

  /**
   * Collects the editable text of the whole document: the content root, the first {@code
   * style:master-page} found in the styles DOM, and the {@code office:meta} element of the meta
   * DOM, in that order.
   *
   * <p>Any exception stops the collection; it is logged together with its stack trace and the
   * text gathered so far is returned.
   *
   * @return the concatenated text, possibly partial if an error occurred
   */
  private String getDocumentText() {
    StringBuilder builder = new StringBuilder();
    try {
      // Extract text from content.xml
      OdfEditableTextExtractor contentDomExtractor =
          newOdfEditableTextExtractor(mDocument.getContentRoot());
      builder.append(contentDomExtractor.getText());

      // Extract text from styles.xml (only the first master page is considered)
      OdfStylesDom styleDom = mDocument.getStylesDom();
      if (styleDom != null) {
        NodeList masterPages = styleDom.getElementsByTagName("style:master-page");
        if (masterPages.getLength() > 0) {
          StyleMasterPageElement masterpage = (StyleMasterPageElement) masterPages.item(0);
          if (masterpage != null) {
            builder.append(newOdfEditableTextExtractor(masterpage).getText());
          }
        }
      }

      // Extract text from meta.xml
      OdfMetaDom metaDom = mDocument.getMetaDom();
      if (metaDom != null) {
        OdfElement root = metaDom.getRootElement();
        OfficeMetaElement officemeta = OdfElement.findFirstChildNode(OfficeMetaElement.class, root);
        if (officemeta != null) {
          builder.append(newOdfEditableTextExtractor(officemeta).getText());
        }
      }
    } catch (Exception e) {
      // Keep the throwable so the stack trace is not lost (the message alone may be null).
      LOGGER.log(Level.SEVERE, e.getMessage(), e);
    }
    return builder.toString();
  }
}