public class HTMLRelevantContentExtractor extends DocumentProcessor implements com.exalead.util.Checkable, java.io.Serializable
Modifier and Type | Class and Description |
---|---|
static class |
HTMLRelevantContentExtractor.AnnotationsToCopy |
static class |
HTMLRelevantContentExtractor.IdsAndClassesToIgnore |
static class |
HTMLRelevantContentExtractor.IdsAndClassesToKeep |
DocumentProcessor.FromDataModel, DocumentProcessor.Transformer<T>
acceptCondition, dataModelClass, dataModelProperty, dataModelState, DEFAULT_DISABLED, disabled, fromDataModel, name
Constructor and Description |
---|
HTMLRelevantContentExtractor() |
HTMLRelevantContentExtractor(HTMLRelevantContentExtractor o)
Copy constructor
|
Modifier and Type | Method and Description |
---|---|
<T> T |
accept(DocumentProcessor.Transformer<T> transformer,
T[] t) |
void |
check(boolean deep,
java.lang.String errorContext)
Checks this HTMLRelevantContentExtractor.
|
static HTMLRelevantContentExtractor |
fromString(java.lang.String s)
Creates an HTMLRelevantContentExtractor from its string representation.
|
HTMLRelevantContentExtractor.AnnotationsToCopy |
getAnnotationsToCopy() |
int |
getClassBoost()
Each time a CSS class included in 'idsAndClassesToKeep' is detected, the score is increased by this value.
|
HTMLRelevantContentExtractor.IdsAndClassesToIgnore |
getIdsAndClassesToIgnore() |
HTMLRelevantContentExtractor.IdsAndClassesToKeep |
getIdsAndClassesToKeep() |
java.lang.String |
getIrrelevantChunkAnnotation()
If set, the HTMLRelevantContentExtractor will annotate each irrelevant chunk with an annotation.
|
java.lang.String |
getIrrelevantChunkContext()
Irrelevant text chunks will be copied into this context.
|
int |
getMaxWordInLinkRatio()
The maximum allowed ratio of words contained in links in a chunk of text.
|
int |
getMinParagraphWords()
The minimum number of words a <p> chunk must have to be considered as a paragraph and be boosted.
|
int |
getMinScore()
Internally, the HTMLRelevantContentExtractor assigns a score to each chunk of its input.
|
int |
getMinTitleWords()
The minimum number of words a title must have to be boosted.
|
java.lang.String |
getNewContextName()
Deprecated.
|
int |
getParagraphBoost()
Each time a paragraph is detected, the score is increased by this value.
|
java.lang.String |
getRelevantChunkContext()
Relevant text chunks will be copied into this context.
|
java.lang.String |
getRetrieveFieldContext()
Original text chunks will be moved into this context.
|
int |
getTitleBoost()
Each time a title is detected, the score is increased by this value.
|
boolean |
isKeepImages()
If true, the HTML image annotations will be kept in the new context.
|
boolean |
isKeepOnlyBestChunk()
If true, the 'relevantcontent' will only be composed of the main article of the page.
|
boolean |
isLinkAllowedInTitle()
By default, links contained in a page title incur a penalty (malus); this can be disabled.
|
boolean |
isSkipBlockquotes()
Ability to skip HTML blockquote tags.
|
boolean |
isSkipPre()
Ability to skip HTML pre tags.
|
HTMLRelevantContentExtractor |
makeCopy()
Creates and returns a deep copy of this HTMLRelevantContentExtractor.
|
static HTMLRelevantContentExtractor |
readFrom(java.io.InputStream is)
Reads an HTMLRelevantContentExtractor from an XML fragment.
|
void |
setAnnotationsToCopy(HTMLRelevantContentExtractor.AnnotationsToCopy __value) |
void |
setClassBoost(int classBoost)
Each time a CSS class included in 'idsAndClassesToKeep' is detected, the score is increased by this value.
|
void |
setIdsAndClassesToIgnore(HTMLRelevantContentExtractor.IdsAndClassesToIgnore __value) |
void |
setIdsAndClassesToKeep(HTMLRelevantContentExtractor.IdsAndClassesToKeep __value) |
void |
setIrrelevantChunkAnnotation(java.lang.String irrelevantChunkAnnotation)
If set, the HTMLRelevantContentExtractor will annotate each irrelevant chunk with an annotation.
|
void |
setIrrelevantChunkContext(java.lang.String irrelevantChunkContext)
Irrelevant text chunks will be copied into this context.
|
void |
setKeepImages(boolean keepImages)
If true, the HTML image annotations will be kept in the new context.
|
void |
setKeepOnlyBestChunk(boolean keepOnlyBestChunk)
If true, the 'relevantcontent' will only be composed of the main article of the page.
|
void |
setLinkAllowedInTitle(boolean linkAllowedInTitle)
By default, links contained in a page title incur a penalty (malus); this can be disabled.
|
void |
setMaxWordInLinkRatio(int maxWordInLinkRatio)
The maximum allowed ratio of words contained in links in a chunk of text.
|
void |
setMinParagraphWords(int minParagraphWords)
The minimum number of words a <p> chunk must have to be considered as a paragraph and be boosted.
|
void |
setMinScore(int minScore)
Internally, the HTMLRelevantContentExtractor assigns a score to each chunk of its input.
|
void |
setMinTitleWords(int minTitleWords)
The minimum number of words a title must have to be boosted.
|
void |
setNewContextName(java.lang.String newContextName)
Deprecated.
|
void |
setParagraphBoost(int paragraphBoost)
Each time a paragraph is detected, the score is increased by this value.
|
void |
setRelevantChunkContext(java.lang.String relevantChunkContext)
Relevant text chunks will be copied into this context.
|
void |
setRetrieveFieldContext(java.lang.String retrieveFieldContext)
Original text chunks will be moved into this context.
|
void |
setSkipBlockquotes(boolean skipBlockquotes)
Ability to skip HTML blockquote tags.
|
void |
setSkipPre(boolean skipPre)
Ability to skip HTML pre tags.
|
void |
setTitleBoost(int titleBoost)
Each time a title is detected, the score is increased by this value.
|
java.lang.String |
toString()
String representation of this HTMLRelevantContentExtractor.
|
HTMLRelevantContentExtractor |
withAcceptCondition(AcceptCondition acceptCondition) |
HTMLRelevantContentExtractor |
withAnnotationsToCopy(java.util.Collection<StringValue> __values) |
HTMLRelevantContentExtractor |
withAnnotationsToCopy(HTMLRelevantContentExtractor.AnnotationsToCopy __value) |
HTMLRelevantContentExtractor |
withAnnotationsToCopy(StringValue... __values) |
HTMLRelevantContentExtractor |
withClassBoost(int classBoost) |
HTMLRelevantContentExtractor |
withClassBoost(java.lang.Integer classBoost) |
HTMLRelevantContentExtractor |
withDataModelClass(java.lang.String dataModelClass) |
HTMLRelevantContentExtractor |
withDataModelProperty(java.lang.String dataModelProperty) |
HTMLRelevantContentExtractor |
withDataModelState(java.lang.String dataModelState) |
HTMLRelevantContentExtractor |
withDisabled(boolean disabled) |
HTMLRelevantContentExtractor |
withDisabled(java.lang.Boolean disabled) |
HTMLRelevantContentExtractor |
withFromDataModel(DocumentProcessor fromDataModel) |
HTMLRelevantContentExtractor |
withIdsAndClassesToIgnore(java.util.Collection<StringValue> __values) |
HTMLRelevantContentExtractor |
withIdsAndClassesToIgnore(HTMLRelevantContentExtractor.IdsAndClassesToIgnore __value) |
HTMLRelevantContentExtractor |
withIdsAndClassesToIgnore(StringValue... __values) |
HTMLRelevantContentExtractor |
withIdsAndClassesToKeep(java.util.Collection<StringValue> __values) |
HTMLRelevantContentExtractor |
withIdsAndClassesToKeep(HTMLRelevantContentExtractor.IdsAndClassesToKeep __value) |
HTMLRelevantContentExtractor |
withIdsAndClassesToKeep(StringValue... __values) |
HTMLRelevantContentExtractor |
withIrrelevantChunkAnnotation(java.lang.String irrelevantChunkAnnotation) |
HTMLRelevantContentExtractor |
withIrrelevantChunkContext(java.lang.String irrelevantChunkContext) |
HTMLRelevantContentExtractor |
withKeepImages(boolean keepImages) |
HTMLRelevantContentExtractor |
withKeepImages(java.lang.Boolean keepImages) |
HTMLRelevantContentExtractor |
withKeepOnlyBestChunk(boolean keepOnlyBestChunk) |
HTMLRelevantContentExtractor |
withKeepOnlyBestChunk(java.lang.Boolean keepOnlyBestChunk) |
HTMLRelevantContentExtractor |
withLinkAllowedInTitle(boolean linkAllowedInTitle) |
HTMLRelevantContentExtractor |
withLinkAllowedInTitle(java.lang.Boolean linkAllowedInTitle) |
HTMLRelevantContentExtractor |
withMaxWordInLinkRatio(int maxWordInLinkRatio) |
HTMLRelevantContentExtractor |
withMaxWordInLinkRatio(java.lang.Integer maxWordInLinkRatio) |
HTMLRelevantContentExtractor |
withMinParagraphWords(int minParagraphWords) |
HTMLRelevantContentExtractor |
withMinParagraphWords(java.lang.Integer minParagraphWords) |
HTMLRelevantContentExtractor |
withMinScore(int minScore) |
HTMLRelevantContentExtractor |
withMinScore(java.lang.Integer minScore) |
HTMLRelevantContentExtractor |
withMinTitleWords(int minTitleWords) |
HTMLRelevantContentExtractor |
withMinTitleWords(java.lang.Integer minTitleWords) |
HTMLRelevantContentExtractor |
withName(java.lang.String name) |
HTMLRelevantContentExtractor |
withNewContextName(java.lang.String newContextName)
Deprecated.
|
HTMLRelevantContentExtractor |
withParagraphBoost(int paragraphBoost) |
HTMLRelevantContentExtractor |
withParagraphBoost(java.lang.Integer paragraphBoost) |
HTMLRelevantContentExtractor |
withRelevantChunkContext(java.lang.String relevantChunkContext) |
HTMLRelevantContentExtractor |
withRetrieveFieldContext(java.lang.String retrieveFieldContext) |
HTMLRelevantContentExtractor |
withSkipBlockquotes(boolean skipBlockquotes) |
HTMLRelevantContentExtractor |
withSkipBlockquotes(java.lang.Boolean skipBlockquotes) |
HTMLRelevantContentExtractor |
withSkipPre(boolean skipPre) |
HTMLRelevantContentExtractor |
withSkipPre(java.lang.Boolean skipPre) |
HTMLRelevantContentExtractor |
withTitleBoost(int titleBoost) |
HTMLRelevantContentExtractor |
withTitleBoost(java.lang.Integer titleBoost) |
void |
writeTo(java.io.OutputStream os)
Writes this HTMLRelevantContentExtractor as an XML fragment.
|
getAcceptCondition, getDataModelClass, getDataModelProperty, getDataModelState, getFromDataModel, getName, isDisabled, setAcceptCondition, setDataModelClass, setDataModelProperty, setDataModelState, setDisabled, setFromDataModel, setName
public java.lang.String relevantChunkContext
public static final java.lang.String DEFAULT_RELEVANT_CHUNK_CONTEXT
@Deprecated public java.lang.String newContextName
public static final java.lang.String DEFAULT_NEW_CONTEXT_NAME
public java.lang.String irrelevantChunkContext
public static final java.lang.String DEFAULT_IRRELEVANT_CHUNK_CONTEXT
public java.lang.String retrieveFieldContext
public static final java.lang.String DEFAULT_RETRIEVE_FIELD_CONTEXT
public java.lang.String irrelevantChunkAnnotation
public int minScore
public static final int DEFAULT_MIN_SCORE
public int minParagraphWords
public static final int DEFAULT_MIN_PARAGRAPH_WORDS
public int minTitleWords
public static final int DEFAULT_MIN_TITLE_WORDS
public boolean linkAllowedInTitle
public static final boolean DEFAULT_LINK_ALLOWED_IN_TITLE
public int paragraphBoost
public static final int DEFAULT_PARAGRAPH_BOOST
public int maxWordInLinkRatio
public static final int DEFAULT_MAX_WORD_IN_LINK_RATIO
public int titleBoost
public static final int DEFAULT_TITLE_BOOST
public int classBoost
public static final int DEFAULT_CLASS_BOOST
protected HTMLRelevantContentExtractor.IdsAndClassesToIgnore idsAndClassesToIgnore
protected HTMLRelevantContentExtractor.IdsAndClassesToKeep idsAndClassesToKeep
public boolean keepOnlyBestChunk
public static final boolean DEFAULT_KEEP_ONLY_BEST_CHUNK
public boolean skipBlockquotes
public static final boolean DEFAULT_SKIP_BLOCKQUOTES
public boolean skipPre
public static final boolean DEFAULT_SKIP_PRE
public boolean keepImages
public static final boolean DEFAULT_KEEP_IMAGES
protected HTMLRelevantContentExtractor.AnnotationsToCopy annotationsToCopy
public HTMLRelevantContentExtractor()
public HTMLRelevantContentExtractor(HTMLRelevantContentExtractor o)
public HTMLRelevantContentExtractor withAcceptCondition(AcceptCondition acceptCondition)
withAcceptCondition
in class DocumentProcessor
public HTMLRelevantContentExtractor withName(java.lang.String name)
withName
in class DocumentProcessor
public HTMLRelevantContentExtractor withDataModelState(java.lang.String dataModelState)
withDataModelState
in class DocumentProcessor
public HTMLRelevantContentExtractor withFromDataModel(DocumentProcessor fromDataModel)
public HTMLRelevantContentExtractor withDataModelClass(java.lang.String dataModelClass)
withDataModelClass
in class DocumentProcessor
public HTMLRelevantContentExtractor withDataModelProperty(java.lang.String dataModelProperty)
withDataModelProperty
in class DocumentProcessor
public HTMLRelevantContentExtractor withDisabled(boolean disabled)
withDisabled
in class DocumentProcessor
public HTMLRelevantContentExtractor withDisabled(java.lang.Boolean disabled)
withDisabled
in class DocumentProcessor
public void setRelevantChunkContext(java.lang.String relevantChunkContext)
public java.lang.String getRelevantChunkContext()
public HTMLRelevantContentExtractor withRelevantChunkContext(java.lang.String relevantChunkContext)
@Deprecated public void setNewContextName(java.lang.String newContextName)
@Deprecated public java.lang.String getNewContextName()
@Deprecated public HTMLRelevantContentExtractor withNewContextName(java.lang.String newContextName)
public void setIrrelevantChunkContext(java.lang.String irrelevantChunkContext)
public java.lang.String getIrrelevantChunkContext()
public HTMLRelevantContentExtractor withIrrelevantChunkContext(java.lang.String irrelevantChunkContext)
public void setRetrieveFieldContext(java.lang.String retrieveFieldContext)
public java.lang.String getRetrieveFieldContext()
public HTMLRelevantContentExtractor withRetrieveFieldContext(java.lang.String retrieveFieldContext)
public void setIrrelevantChunkAnnotation(java.lang.String irrelevantChunkAnnotation)
public java.lang.String getIrrelevantChunkAnnotation()
public HTMLRelevantContentExtractor withIrrelevantChunkAnnotation(java.lang.String irrelevantChunkAnnotation)
public void setMinScore(int minScore)
public int getMinScore()
public HTMLRelevantContentExtractor withMinScore(int minScore)
public HTMLRelevantContentExtractor withMinScore(java.lang.Integer minScore)
public void setMinParagraphWords(int minParagraphWords)
public int getMinParagraphWords()
public HTMLRelevantContentExtractor withMinParagraphWords(int minParagraphWords)
public HTMLRelevantContentExtractor withMinParagraphWords(java.lang.Integer minParagraphWords)
public void setMinTitleWords(int minTitleWords)
public int getMinTitleWords()
public HTMLRelevantContentExtractor withMinTitleWords(int minTitleWords)
public HTMLRelevantContentExtractor withMinTitleWords(java.lang.Integer minTitleWords)
public void setLinkAllowedInTitle(boolean linkAllowedInTitle)
public boolean isLinkAllowedInTitle()
public HTMLRelevantContentExtractor withLinkAllowedInTitle(boolean linkAllowedInTitle)
public HTMLRelevantContentExtractor withLinkAllowedInTitle(java.lang.Boolean linkAllowedInTitle)
public void setParagraphBoost(int paragraphBoost)
public int getParagraphBoost()
public HTMLRelevantContentExtractor withParagraphBoost(int paragraphBoost)
public HTMLRelevantContentExtractor withParagraphBoost(java.lang.Integer paragraphBoost)
public void setMaxWordInLinkRatio(int maxWordInLinkRatio)
public int getMaxWordInLinkRatio()
public HTMLRelevantContentExtractor withMaxWordInLinkRatio(int maxWordInLinkRatio)
public HTMLRelevantContentExtractor withMaxWordInLinkRatio(java.lang.Integer maxWordInLinkRatio)
public void setTitleBoost(int titleBoost)
public int getTitleBoost()
public HTMLRelevantContentExtractor withTitleBoost(int titleBoost)
public HTMLRelevantContentExtractor withTitleBoost(java.lang.Integer titleBoost)
public void setClassBoost(int classBoost)
public int getClassBoost()
public HTMLRelevantContentExtractor withClassBoost(int classBoost)
public HTMLRelevantContentExtractor withClassBoost(java.lang.Integer classBoost)
public HTMLRelevantContentExtractor.IdsAndClassesToIgnore getIdsAndClassesToIgnore()
public void setIdsAndClassesToIgnore(HTMLRelevantContentExtractor.IdsAndClassesToIgnore __value)
public HTMLRelevantContentExtractor withIdsAndClassesToIgnore(StringValue... __values)
public HTMLRelevantContentExtractor withIdsAndClassesToIgnore(java.util.Collection<StringValue> __values)
public HTMLRelevantContentExtractor withIdsAndClassesToIgnore(HTMLRelevantContentExtractor.IdsAndClassesToIgnore __value)
public HTMLRelevantContentExtractor.IdsAndClassesToKeep getIdsAndClassesToKeep()
public void setIdsAndClassesToKeep(HTMLRelevantContentExtractor.IdsAndClassesToKeep __value)
public HTMLRelevantContentExtractor withIdsAndClassesToKeep(StringValue... __values)
public HTMLRelevantContentExtractor withIdsAndClassesToKeep(java.util.Collection<StringValue> __values)
public HTMLRelevantContentExtractor withIdsAndClassesToKeep(HTMLRelevantContentExtractor.IdsAndClassesToKeep __value)
public void setKeepOnlyBestChunk(boolean keepOnlyBestChunk)
public boolean isKeepOnlyBestChunk()
public HTMLRelevantContentExtractor withKeepOnlyBestChunk(boolean keepOnlyBestChunk)
public HTMLRelevantContentExtractor withKeepOnlyBestChunk(java.lang.Boolean keepOnlyBestChunk)
public void setSkipBlockquotes(boolean skipBlockquotes)
public boolean isSkipBlockquotes()
public HTMLRelevantContentExtractor withSkipBlockquotes(boolean skipBlockquotes)
public HTMLRelevantContentExtractor withSkipBlockquotes(java.lang.Boolean skipBlockquotes)
public void setSkipPre(boolean skipPre)
public boolean isSkipPre()
public HTMLRelevantContentExtractor withSkipPre(boolean skipPre)
public HTMLRelevantContentExtractor withSkipPre(java.lang.Boolean skipPre)
public void setKeepImages(boolean keepImages)
public boolean isKeepImages()
public HTMLRelevantContentExtractor withKeepImages(boolean keepImages)
public HTMLRelevantContentExtractor withKeepImages(java.lang.Boolean keepImages)
public HTMLRelevantContentExtractor.AnnotationsToCopy getAnnotationsToCopy()
public void setAnnotationsToCopy(HTMLRelevantContentExtractor.AnnotationsToCopy __value)
public HTMLRelevantContentExtractor withAnnotationsToCopy(StringValue... __values)
public HTMLRelevantContentExtractor withAnnotationsToCopy(java.util.Collection<StringValue> __values)
public HTMLRelevantContentExtractor withAnnotationsToCopy(HTMLRelevantContentExtractor.AnnotationsToCopy __value)
public HTMLRelevantContentExtractor makeCopy()
makeCopy
in class DocumentProcessor
public static HTMLRelevantContentExtractor readFrom(java.io.InputStream is) throws javax.xml.bind.JAXBException
javax.xml.bind.JAXBException
public void writeTo(java.io.OutputStream os) throws javax.xml.bind.JAXBException, java.io.IOException
writeTo
in class DocumentProcessor
javax.xml.bind.JAXBException
java.io.IOException
public static HTMLRelevantContentExtractor fromString(java.lang.String s) throws javax.xml.bind.JAXBException, java.io.UnsupportedEncodingException
javax.xml.bind.JAXBException
java.io.UnsupportedEncodingException
public java.lang.String toString()
toString
in class DocumentProcessor
public void check(boolean deep, java.lang.String errorContext) throws com.exalead.util.TypedException
check
in interface com.exalead.util.Checkable
check
in class DocumentProcessor
com.exalead.util.TypedException
public <T> T accept(DocumentProcessor.Transformer<T> transformer, T[] t) throws com.exalead.util.TypedException
accept
in class DocumentProcessor
com.exalead.util.TypedException
Copyright © 2021 Dassault Systèmes, All Rights Reserved.