public class NativeTextExtractor extends DocumentProcessor implements com.exalead.util.Checkable, java.io.Serializable
Nested classes/interfaces inherited from class DocumentProcessor: DocumentProcessor.FromDataModel, DocumentProcessor.Transformer<T>
Modifier and Type | Field and Description |
---|---|
boolean |
annotateHTML |
static boolean |
DEFAULT_ANNOTATE_H_T_M_L |
static boolean |
DEFAULT_DISABLE_AUTOMATIC_H_T_M_L_D_T_D_FIX |
static boolean |
DEFAULT_EXTRACT_H_T_M_L_FORMS |
static boolean |
DEFAULT_EXTRACT_H_T_M_L_STYLES |
static boolean |
DEFAULT_EXTRACT_H_T_M_L_TABLES |
static boolean |
DEFAULT_EXTRACT_JS |
static int |
DEFAULT_MAX_H_T_M_L_ANNOTATION_DEPTH |
static boolean |
DEFAULT_SKIP_INVISIBLE_H_T_M_L_TEXT |
boolean |
disableAutomaticHTMLDTDFix |
boolean |
extractHTMLForms |
boolean |
extractHTMLStyles |
boolean |
extractHTMLTables |
boolean |
extractJs |
int |
maxHTMLAnnotationDepth |
boolean |
skipInvisibleHTMLText |
Fields inherited from class DocumentProcessor: acceptCondition, dataModelClass, dataModelProperty, dataModelState, DEFAULT_DISABLED, disabled, fromDataModel, name
Constructor and Description |
---|
NativeTextExtractor() |
NativeTextExtractor(NativeTextExtractor o)
Copy constructor
|
Modifier and Type | Method and Description |
---|---|
<T> T |
accept(DocumentProcessor.Transformer<T> transformer,
T[] t) |
void |
check(boolean deep,
java.lang.String errorContext)
Checks this NativeTextExtractor.
|
static NativeTextExtractor |
fromString(java.lang.String s)
Builds a NativeTextExtractor from its string representation.
|
int |
getMaxHTMLAnnotationDepth()
Prevents new annotations from being created beyond the {@code maxHTMLAnnotationDepth} HTML nesting level.
|
boolean |
isAnnotateHTML()
Adds style annotations to DocumentChunks (for HTML files only):
html:p for DocumentChunks generated from <p> elements, etc.
|
boolean |
isDisableAutomaticHTMLDTDFix()
Disables automatic DTD fix on HTML documents.
|
boolean |
isExtractHTMLForms()
Adds annotations on form and select elements.
|
boolean |
isExtractHTMLStyles()
Adds annotations on style attributes.
|
boolean |
isExtractHTMLTables()
Adds annotations on {@code table}, {@code tr}, {@code td} and {@code th} elements.
|
boolean |
isExtractJs()
Tries to parse JavaScript and then extract links.
|
boolean |
isSkipInvisibleHTMLText()
Skips the invisible text.
|
NativeTextExtractor |
makeCopy()
Creates and returns a deep copy of this NativeTextExtractor.
|
static NativeTextExtractor |
readFrom(java.io.InputStream is)
Reads a NativeTextExtractor from an XML fragment.
|
void |
setAnnotateHTML(boolean annotateHTML)
Adds style annotations to DocumentChunks (for HTML files only):
html:p for DocumentChunks generated from <p> elements, etc.
|
void |
setDisableAutomaticHTMLDTDFix(boolean disableAutomaticHTMLDTDFix)
Disables automatic DTD fix on HTML documents.
|
void |
setExtractHTMLForms(boolean extractHTMLForms)
Adds annotations on form and select elements.
|
void |
setExtractHTMLStyles(boolean extractHTMLStyles)
Adds annotations on style attributes.
|
void |
setExtractHTMLTables(boolean extractHTMLTables)
Adds annotations on {@code table}, {@code tr}, {@code td} and {@code th} elements.
|
void |
setExtractJs(boolean extractJs)
Tries to parse JavaScript and then extract links.
|
void |
setMaxHTMLAnnotationDepth(int maxHTMLAnnotationDepth)
Prevents new annotations from being created beyond the {@code maxHTMLAnnotationDepth} HTML nesting level.
|
void |
setSkipInvisibleHTMLText(boolean skipInvisibleHTMLText)
Skips the invisible text.
|
java.lang.String |
toString()
String representation of this NativeTextExtractor.
|
NativeTextExtractor |
withAcceptCondition(AcceptCondition acceptCondition) |
NativeTextExtractor |
withAnnotateHTML(boolean annotateHTML) |
NativeTextExtractor |
withAnnotateHTML(java.lang.Boolean annotateHTML) |
NativeTextExtractor |
withDataModelClass(java.lang.String dataModelClass) |
NativeTextExtractor |
withDataModelProperty(java.lang.String dataModelProperty) |
NativeTextExtractor |
withDataModelState(java.lang.String dataModelState) |
NativeTextExtractor |
withDisableAutomaticHTMLDTDFix(boolean disableAutomaticHTMLDTDFix) |
NativeTextExtractor |
withDisableAutomaticHTMLDTDFix(java.lang.Boolean disableAutomaticHTMLDTDFix) |
NativeTextExtractor |
withDisabled(boolean disabled) |
NativeTextExtractor |
withDisabled(java.lang.Boolean disabled) |
NativeTextExtractor |
withExtractHTMLForms(boolean extractHTMLForms) |
NativeTextExtractor |
withExtractHTMLForms(java.lang.Boolean extractHTMLForms) |
NativeTextExtractor |
withExtractHTMLStyles(boolean extractHTMLStyles) |
NativeTextExtractor |
withExtractHTMLStyles(java.lang.Boolean extractHTMLStyles) |
NativeTextExtractor |
withExtractHTMLTables(boolean extractHTMLTables) |
NativeTextExtractor |
withExtractHTMLTables(java.lang.Boolean extractHTMLTables) |
NativeTextExtractor |
withExtractJs(boolean extractJs) |
NativeTextExtractor |
withExtractJs(java.lang.Boolean extractJs) |
NativeTextExtractor |
withFromDataModel(DocumentProcessor fromDataModel) |
NativeTextExtractor |
withMaxHTMLAnnotationDepth(int maxHTMLAnnotationDepth) |
NativeTextExtractor |
withMaxHTMLAnnotationDepth(java.lang.Integer maxHTMLAnnotationDepth) |
NativeTextExtractor |
withName(java.lang.String name) |
NativeTextExtractor |
withSkipInvisibleHTMLText(boolean skipInvisibleHTMLText) |
NativeTextExtractor |
withSkipInvisibleHTMLText(java.lang.Boolean skipInvisibleHTMLText) |
void |
writeTo(java.io.OutputStream os)
Writes this NativeTextExtractor as an XML fragment.
|
Methods inherited from class DocumentProcessor: getAcceptCondition, getDataModelClass, getDataModelProperty, getDataModelState, getFromDataModel, getName, isDisabled, setAcceptCondition, setDataModelClass, setDataModelProperty, setDataModelState, setDisabled, setFromDataModel, setName
public boolean annotateHTML
public static final boolean DEFAULT_ANNOTATE_H_T_M_L
public boolean skipInvisibleHTMLText
public static final boolean DEFAULT_SKIP_INVISIBLE_H_T_M_L_TEXT
public boolean extractJs
public static final boolean DEFAULT_EXTRACT_JS
public boolean extractHTMLTables
public static final boolean DEFAULT_EXTRACT_H_T_M_L_TABLES
public boolean extractHTMLStyles
public static final boolean DEFAULT_EXTRACT_H_T_M_L_STYLES
public boolean extractHTMLForms
public static final boolean DEFAULT_EXTRACT_H_T_M_L_FORMS
public int maxHTMLAnnotationDepth
public static final int DEFAULT_MAX_H_T_M_L_ANNOTATION_DEPTH
public boolean disableAutomaticHTMLDTDFix
public static final boolean DEFAULT_DISABLE_AUTOMATIC_H_T_M_L_D_T_D_FIX
public NativeTextExtractor()
public NativeTextExtractor(NativeTextExtractor o)
public NativeTextExtractor withAcceptCondition(AcceptCondition acceptCondition)
withAcceptCondition
in class DocumentProcessor
public NativeTextExtractor withName(java.lang.String name)
withName
in class DocumentProcessor
public NativeTextExtractor withDataModelState(java.lang.String dataModelState)
withDataModelState
in class DocumentProcessor
public NativeTextExtractor withFromDataModel(DocumentProcessor fromDataModel)
public NativeTextExtractor withDataModelClass(java.lang.String dataModelClass)
withDataModelClass
in class DocumentProcessor
public NativeTextExtractor withDataModelProperty(java.lang.String dataModelProperty)
withDataModelProperty
in class DocumentProcessor
public NativeTextExtractor withDisabled(boolean disabled)
withDisabled
in class DocumentProcessor
public NativeTextExtractor withDisabled(java.lang.Boolean disabled)
withDisabled
in class DocumentProcessor
public void setAnnotateHTML(boolean annotateHTML)
public boolean isAnnotateHTML()
public NativeTextExtractor withAnnotateHTML(boolean annotateHTML)
public NativeTextExtractor withAnnotateHTML(java.lang.Boolean annotateHTML)
public void setSkipInvisibleHTMLText(boolean skipInvisibleHTMLText)
public boolean isSkipInvisibleHTMLText()
public NativeTextExtractor withSkipInvisibleHTMLText(boolean skipInvisibleHTMLText)
public NativeTextExtractor withSkipInvisibleHTMLText(java.lang.Boolean skipInvisibleHTMLText)
public void setExtractJs(boolean extractJs)
public boolean isExtractJs()
public NativeTextExtractor withExtractJs(boolean extractJs)
public NativeTextExtractor withExtractJs(java.lang.Boolean extractJs)
public void setExtractHTMLTables(boolean extractHTMLTables)
public boolean isExtractHTMLTables()
public NativeTextExtractor withExtractHTMLTables(boolean extractHTMLTables)
public NativeTextExtractor withExtractHTMLTables(java.lang.Boolean extractHTMLTables)
public void setExtractHTMLStyles(boolean extractHTMLStyles)
public boolean isExtractHTMLStyles()
public NativeTextExtractor withExtractHTMLStyles(boolean extractHTMLStyles)
public NativeTextExtractor withExtractHTMLStyles(java.lang.Boolean extractHTMLStyles)
public void setExtractHTMLForms(boolean extractHTMLForms)
public boolean isExtractHTMLForms()
public NativeTextExtractor withExtractHTMLForms(boolean extractHTMLForms)
public NativeTextExtractor withExtractHTMLForms(java.lang.Boolean extractHTMLForms)
public void setMaxHTMLAnnotationDepth(int maxHTMLAnnotationDepth)
public int getMaxHTMLAnnotationDepth()
public NativeTextExtractor withMaxHTMLAnnotationDepth(int maxHTMLAnnotationDepth)
public NativeTextExtractor withMaxHTMLAnnotationDepth(java.lang.Integer maxHTMLAnnotationDepth)
public void setDisableAutomaticHTMLDTDFix(boolean disableAutomaticHTMLDTDFix)
public boolean isDisableAutomaticHTMLDTDFix()
public NativeTextExtractor withDisableAutomaticHTMLDTDFix(boolean disableAutomaticHTMLDTDFix)
public NativeTextExtractor withDisableAutomaticHTMLDTDFix(java.lang.Boolean disableAutomaticHTMLDTDFix)
public NativeTextExtractor makeCopy()
makeCopy
in class DocumentProcessor
public static NativeTextExtractor readFrom(java.io.InputStream is) throws javax.xml.bind.JAXBException
javax.xml.bind.JAXBException
public void writeTo(java.io.OutputStream os) throws javax.xml.bind.JAXBException, java.io.IOException
writeTo
in class DocumentProcessor
javax.xml.bind.JAXBException
java.io.IOException
public static NativeTextExtractor fromString(java.lang.String s) throws javax.xml.bind.JAXBException, java.io.UnsupportedEncodingException
javax.xml.bind.JAXBException
java.io.UnsupportedEncodingException
public java.lang.String toString()
toString
in class DocumentProcessor
public void check(boolean deep, java.lang.String errorContext) throws com.exalead.util.TypedException
check
in interface com.exalead.util.Checkable
check
in class DocumentProcessor
com.exalead.util.TypedException
public <T> T accept(DocumentProcessor.Transformer<T> transformer, T[] t) throws com.exalead.util.TypedException
accept
in class DocumentProcessor
com.exalead.util.TypedException
Copyright © 2021 Dassault Systèmes, All Rights Reserved.