OdfEditableTextExtractor.java

/**
 * **********************************************************************
 *
 * <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * <p>http://www.apache.org/licenses/LICENSE-2.0
 *
 * <p>Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * <p>**********************************************************************
 */
package org.odftoolkit.odfdom.incubator.doc.text;

import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.odftoolkit.odfdom.doc.OdfDocument;
import org.odftoolkit.odfdom.doc.table.OdfTable;
import org.odftoolkit.odfdom.doc.table.OdfTableRow;
import org.odftoolkit.odfdom.dom.OdfContentDom;
import org.odftoolkit.odfdom.dom.OdfMetaDom;
import org.odftoolkit.odfdom.dom.OdfStylesDom;
import org.odftoolkit.odfdom.dom.element.draw.DrawObjectElement;
import org.odftoolkit.odfdom.dom.element.office.OfficeMetaElement;
import org.odftoolkit.odfdom.dom.element.style.StyleMasterPageElement;
import org.odftoolkit.odfdom.dom.element.table.TableTableElement;
import org.odftoolkit.odfdom.dom.element.text.TextAElement;
import org.odftoolkit.odfdom.dom.element.text.TextTrackedChangesElement;
import org.odftoolkit.odfdom.pkg.OdfElement;
import org.w3c.dom.NodeList;

/**
 * A subclass of {@link OdfTextExtractor} that returns all the text a user can typically edit in a
 * document: the text in content.xml, the text of the first master page (header and footer) in
 * styles.xml, and the meta data in meta.xml.
 *
 * <p>It can be used by search engines and text analytics operations.
 */
public class OdfEditableTextExtractor extends OdfTextExtractor {

  private static final Logger LOG = Logger.getLogger(OdfEditableTextExtractor.class.getName());

  /** The document to extract from; only set when created through the document factory. */
  OdfDocument mDocument = null;

  /** The element to extract from; only set when created through the element factory. */
  OdfElement mElement = null;

  /** {@code true} when this extractor works on a whole document rather than one element. */
  boolean mIsDocumentExtractor = false;

  /**
   * Constructor with an ODF document as a parameter
   *
   * @param doc the ODF document whose editable text would be extracted.
   */
  private OdfEditableTextExtractor(OdfDocument doc) {
    mTextBuilder = new StringBuilder();
    mDocument = doc;
    mIsDocumentExtractor = true;
  }

  /**
   * Constructor with an ODF element as parameter
   *
   * @param element the ODF element whose editable text would be extracted.
   */
  private OdfEditableTextExtractor(OdfElement element) {
    mTextBuilder = new StringBuilder();
    mElement = element;
    mIsDocumentExtractor = false;
  }

  /**
   * An instance of OdfEditableTextExtractor will be created to extract the editable text content of
   * an ODF document.
   *
   * @param doc the ODF document whose text will be extracted.
   * @return An instance of OdfEditableTextExtractor
   */
  public static OdfEditableTextExtractor newOdfEditableTextExtractor(OdfDocument doc) {
    return new OdfEditableTextExtractor(doc);
  }

  /**
   * An instance of OdfEditableTextExtractor will be created to extract the editable text content of
   * an ODF element.
   *
   * @param element the ODF element whose text will be extracted.
   * @return An instance of OdfEditableTextExtractor
   */
  public static OdfEditableTextExtractor newOdfEditableTextExtractor(OdfElement element) {
    return new OdfEditableTextExtractor(element);
  }

  /**
   * Appends the editable text of the sub document referenced by an embedded object.
   *
   * <p>The object is skipped when it has no {@code xlink:href}, when its owner DOM is not an
   * {@link OdfContentDom}, when that DOM's document is not an {@link OdfDocument}, or when no sub
   * document could be loaded for the reference.
   *
   * @param element the embedded object element
   * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.draw.DrawObjectElement)
   */
  @Override
  public void visit(DrawObjectElement element) {
    String embedDocPath = element.getXlinkHrefAttribute();
    if (embedDocPath == null) {
      // No reference to follow, so there is no sub document to load.
      return;
    }
    // This extractor is also run on elements taken from the styles DOM (see getDocumentText), so
    // the owner DOM is checked instead of being cast unconditionally.
    if (!(element.getOwnerDocument() instanceof OdfContentDom)) {
      LOG.log(Level.FINE, "Skipping embedded object outside of a content DOM: {0}", embedDocPath);
      return;
    }
    Object ownerDoc = ((OdfContentDom) element.getOwnerDocument()).getDocument();
    if (!(ownerDoc instanceof OdfDocument)) {
      LOG.log(Level.FINE, "Skipping embedded object without an ODF document: {0}", embedDocPath);
      return;
    }
    OdfDocument embedDoc = ((OdfDocument) ownerDoc).loadSubDocument(embedDocPath);
    if (embedDoc != null) {
      try {
        mTextBuilder.append(
            OdfEditableTextExtractor.newOdfEditableTextExtractor(embedDoc).getText());
      } catch (Exception e) {
        LOG.log(Level.SEVERE, null, e);
      }
    }
  }

  /**
   * Ignores tracked changes: nothing is appended for this element.
   *
   * @param ele the tracked-changes element
   * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextTrackedChangesElement)
   */
  @Override
  public void visit(TextTrackedChangesElement ele) {
    // Intentionally empty.
  }

  /**
   * Appends the link target of a hyperlink, when present, followed by the element's text.
   *
   * @param ele the hyperlink element
   * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.text.TextAElement)
   */
  @Override
  public void visit(TextAElement ele) {
    String link = ele.getXlinkHrefAttribute();
    // StringBuilder.append((String) null) would add the literal text "null" to the output.
    if (link != null) {
      mTextBuilder.append(link);
    }
    appendElementText(ele);
  }

  /**
   * Appends the display text of every cell of the table: each cell is followed by a tab character
   * and each row by a new-line character.
   *
   * @param ele the table element
   * @see org.odftoolkit.odfdom.dom.DefaultElementVisitor#visit(org.odftoolkit.odfdom.dom.element.table.TableTableElement)
   */
  @Override
  public void visit(TableTableElement ele) {
    OdfTable table = OdfTable.getInstance(ele);
    List<OdfTableRow> rows = table.getRowList();
    for (OdfTableRow row : rows) {
      // The cell count is re-read on every iteration, as in the index-based access below.
      for (int cellIndex = 0; cellIndex < row.getCellCount(); cellIndex++) {
        mTextBuilder.append(row.getCellByIndex(cellIndex).getDisplayText()).append(TabChar);
      }
      mTextBuilder.append(NewLineChar);
    }
  }

  /**
   * Return the editable text content as a string
   *
   * @return the editable text content as a string
   */
  @Override
  public String getText() {
    if (mIsDocumentExtractor) {
      return getDocumentText();
    }
    visit(mElement);
    return mTextBuilder.toString();
  }

  /**
   * Collects the editable text of the whole document in this order: content root, first master
   * page of the styles DOM, {@code office:meta} element of the meta DOM.
   *
   * <p>If any step throws, the exception is logged and the text gathered so far is returned.
   *
   * @return the collected text, possibly partial when an exception occurred
   */
  private String getDocumentText() {
    StringBuilder builder = new StringBuilder();
    try {
      // Extract text from content.xml
      builder.append(newOdfEditableTextExtractor(mDocument.getContentRoot()).getText());

      // Extract text from styles.xml (only the first master page is considered)
      OdfStylesDom styleDom = mDocument.getStylesDom();
      if (styleDom != null) {
        NodeList masterPages = styleDom.getElementsByTagName("style:master-page");
        if (masterPages.getLength() > 0) {
          StyleMasterPageElement masterPage = (StyleMasterPageElement) masterPages.item(0);
          if (masterPage != null) {
            builder.append(newOdfEditableTextExtractor(masterPage).getText());
          }
        }
      }

      // Extract text from meta.xml
      OdfMetaDom metaDom = mDocument.getMetaDom();
      if (metaDom != null) {
        OdfElement root = metaDom.getRootElement();
        OfficeMetaElement officeMeta = OdfElement.findFirstChildNode(OfficeMetaElement.class, root);
        if (officeMeta != null) {
          builder.append(newOdfEditableTextExtractor(officeMeta).getText());
        }
      }
    } catch (Exception e) {
      // Best effort: keep what was extracted, but log the full exception, not just its message.
      LOG.log(Level.SEVERE, e.getMessage(), e);
    }
    return builder.toString();
  }
}