public abstract class ICrawler
extends java.lang.Object
implements java.io.Serializable, java.lang.Cloneable
Modifier and Type | Class and Description |
---|---|
static class |
ICrawler.MimeTypes
Java class for anonymous complex type.
|
static class |
ICrawler.SessionIdBlacklist
Java class for anonymous complex type.
|
Modifier and Type | Field and Description |
---|---|
protected java.lang.Boolean |
aggressive |
protected java.lang.String |
buildGroup |
protected java.lang.String |
crawlerServer |
protected CrawlSchedulerConfig |
crawlSchedulerConfig |
protected java.lang.Boolean |
crawlSitemaps |
protected CustomCrawlConfig |
customCrawlConfig |
protected java.lang.Boolean |
defaultAccept |
protected java.lang.Boolean |
defaultFollow |
protected java.lang.Boolean |
defaultFollowRoots |
protected java.lang.Boolean |
defaultIndex |
protected java.lang.Boolean |
disableConditionalGet |
protected java.lang.Boolean |
enableConvertProcessor |
protected java.lang.Boolean |
enableSimpleSiteCollapsing |
protected java.lang.String |
fetcher |
protected java.lang.Boolean |
ignoreRobotsTxt |
protected java.lang.Boolean |
indexRedirectSources |
protected ICrawler.MimeTypes |
mimeTypes |
protected java.lang.String |
mimeTypesMode |
protected java.lang.String |
name |
protected java.lang.Boolean |
nearDuplicateDetector |
protected java.lang.Integer |
nthreads |
protected java.lang.Boolean |
patternsDetector |
protected java.util.List<Rules> |
rules |
protected ICrawler.SessionIdBlacklist |
sessionIdBlacklist |
protected java.lang.Long |
simpleSiteCollapsingDepth |
protected java.lang.Boolean |
smartRefresh |
protected java.lang.Integer |
smartRefreshMaxAgeS |
protected java.lang.Integer |
smartRefreshMinAgeS |
protected java.lang.Boolean |
storeTextOnly |
protected java.lang.Integer |
throttleTimeMS |
Constructor and Description |
---|
ICrawler()
Creates a new
ICrawler instance. |
ICrawler(ICrawler o)
Creates a new
ICrawler instance by deeply copying a given ICrawler instance. |
Modifier and Type | Method and Description |
---|---|
ICrawler |
clone()
Creates and returns a deep copy of this object.
|
java.lang.String |
getBuildGroup()
Target build group.
|
java.lang.String |
getCrawlerServer()
Crawler server hosting this crawler.
|
CrawlSchedulerConfig |
getCrawlSchedulerConfig()
Gets the value of the crawlSchedulerConfig property.
|
CustomCrawlConfig |
getCustomCrawlConfig()
Gets the value of the customCrawlConfig property.
|
java.lang.String |
getFetcher()
Which fetcher to use.
|
ICrawler.MimeTypes |
getMimeTypes()
Gets the value of the mimeTypes property.
|
java.lang.String |
getMimeTypesMode()
MIME types whitelist/blacklist mode (default: exclude).
|
java.lang.String |
getName()
The crawler name.
|
int |
getNthreads()
The number of crawl threads, which must be strictly positive.
|
java.util.List<Rules> |
getRules()
Gets the value of the rules property.
|
ICrawler.SessionIdBlacklist |
getSessionIdBlacklist()
SessionId blacklist.
|
long |
getSimpleSiteCollapsingDepth()
How many path segments to use to generate the site collapsing ID.
|
int |
getSmartRefreshMaxAgeS()
Age in seconds at which we force the refresh of old urls.
|
int |
getSmartRefreshMinAgeS()
Age in seconds at which we may refresh old urls.
|
int |
getThrottleTimeMS()
In the case of non-aggressive crawl, this defines the sleep interval between requests to the same host.
|
boolean |
isAggressive()
Whether to enable aggressive crawling, which never sleeps between two requests to the same host.
|
boolean |
isCrawlSitemaps()
Whether to crawl sitemaps.
|
boolean |
isDefaultAccept()
Whether to crawl a url by default when it matches no other accept rule.
|
boolean |
isDefaultFollow()
Whether to follow by default when a url matches no follow rule.
|
boolean |
isDefaultFollowRoots()
Whether to automatically follow root urls (default: true)
|
boolean |
isDefaultIndex()
Whether to index by default when a url matches no index rule.
|
boolean |
isDisableConditionalGet()
Whether to always fetch documents, even if the server reports that they have not changed.
|
boolean |
isEnableConvertProcessor()
Whether to enable the remoteconvert-based processor for extracting links from binary documents.
|
boolean |
isEnableSimpleSiteCollapsing()
Whether to generate a site ID suitable for document collapsing.
|
boolean |
isIgnoreRobotsTxt()
Whether to ignore robots.txt rules.
|
boolean |
isIndexRedirectSources()
Whether to index redirections with target data.
|
boolean |
isNearDuplicateDetector()
Whether to enable the near-duplicate content detector.
|
boolean |
isPatternsDetector()
Whether to enable patterns detection in pages.
|
boolean |
isSmartRefresh()
Whether to crawl a fraction of refreshed urls.
|
boolean |
isStoreTextOnly()
Whether to store original binary documents, or only converted text.
|
void |
setAggressive(java.lang.Boolean value)
Sets the value of the aggressive property.
|
void |
setBuildGroup(java.lang.String value)
Sets the value of the buildGroup property.
|
void |
setCrawlerServer(java.lang.String value)
Sets the value of the crawlerServer property.
|
void |
setCrawlSchedulerConfig(CrawlSchedulerConfig value)
Sets the value of the crawlSchedulerConfig property.
|
void |
setCrawlSitemaps(java.lang.Boolean value)
Sets the value of the crawlSitemaps property.
|
void |
setCustomCrawlConfig(CustomCrawlConfig value)
Sets the value of the customCrawlConfig property.
|
void |
setDefaultAccept(java.lang.Boolean value)
Sets the value of the defaultAccept property.
|
void |
setDefaultFollow(java.lang.Boolean value)
Sets the value of the defaultFollow property.
|
void |
setDefaultFollowRoots(java.lang.Boolean value)
Sets the value of the defaultFollowRoots property.
|
void |
setDefaultIndex(java.lang.Boolean value)
Sets the value of the defaultIndex property.
|
void |
setDisableConditionalGet(java.lang.Boolean value)
Sets the value of the disableConditionalGet property.
|
void |
setEnableConvertProcessor(java.lang.Boolean value)
Sets the value of the enableConvertProcessor property.
|
void |
setEnableSimpleSiteCollapsing(java.lang.Boolean value)
Sets the value of the enableSimpleSiteCollapsing property.
|
void |
setFetcher(java.lang.String value)
Sets the value of the fetcher property.
|
void |
setIgnoreRobotsTxt(java.lang.Boolean value)
Sets the value of the ignoreRobotsTxt property.
|
void |
setIndexRedirectSources(java.lang.Boolean value)
Sets the value of the indexRedirectSources property.
|
void |
setMimeTypes(ICrawler.MimeTypes value)
Sets the value of the mimeTypes property.
|
void |
setMimeTypesMode(java.lang.String value)
Sets the value of the mimeTypesMode property.
|
void |
setName(java.lang.String value)
Sets the value of the name property.
|
void |
setNearDuplicateDetector(java.lang.Boolean value)
Sets the value of the nearDuplicateDetector property.
|
void |
setNthreads(java.lang.Integer value)
Sets the value of the nthreads property.
|
void |
setPatternsDetector(java.lang.Boolean value)
Sets the value of the patternsDetector property.
|
void |
setSessionIdBlacklist(ICrawler.SessionIdBlacklist value)
Sets the value of the sessionIdBlacklist property.
|
void |
setSimpleSiteCollapsingDepth(java.lang.Long value)
Sets the value of the simpleSiteCollapsingDepth property.
|
void |
setSmartRefresh(java.lang.Boolean value)
Sets the value of the smartRefresh property.
|
void |
setSmartRefreshMaxAgeS(java.lang.Integer value)
Sets the value of the smartRefreshMaxAgeS property.
|
void |
setSmartRefreshMinAgeS(java.lang.Integer value)
Sets the value of the smartRefreshMinAgeS property.
|
void |
setStoreTextOnly(java.lang.Boolean value)
Sets the value of the storeTextOnly property.
|
void |
setThrottleTimeMS(java.lang.Integer value)
Sets the value of the throttleTimeMS property.
|
ICrawler |
withAggressive(java.lang.Boolean value) |
ICrawler |
withBuildGroup(java.lang.String value) |
ICrawler |
withCrawlerServer(java.lang.String value) |
ICrawler |
withCrawlSchedulerConfig(CrawlSchedulerConfig value) |
ICrawler |
withCrawlSitemaps(java.lang.Boolean value) |
ICrawler |
withCustomCrawlConfig(CustomCrawlConfig value) |
ICrawler |
withDefaultAccept(java.lang.Boolean value) |
ICrawler |
withDefaultFollow(java.lang.Boolean value) |
ICrawler |
withDefaultFollowRoots(java.lang.Boolean value) |
ICrawler |
withDefaultIndex(java.lang.Boolean value) |
ICrawler |
withDisableConditionalGet(java.lang.Boolean value) |
ICrawler |
withEnableConvertProcessor(java.lang.Boolean value) |
ICrawler |
withEnableSimpleSiteCollapsing(java.lang.Boolean value) |
ICrawler |
withFetcher(java.lang.String value) |
ICrawler |
withIgnoreRobotsTxt(java.lang.Boolean value) |
ICrawler |
withIndexRedirectSources(java.lang.Boolean value) |
ICrawler |
withMimeTypes(ICrawler.MimeTypes value) |
ICrawler |
withMimeTypesMode(java.lang.String value) |
ICrawler |
withName(java.lang.String value) |
ICrawler |
withNearDuplicateDetector(java.lang.Boolean value) |
ICrawler |
withNthreads(java.lang.Integer value) |
ICrawler |
withPatternsDetector(java.lang.Boolean value) |
ICrawler |
withRules(java.util.Collection<Rules> values) |
ICrawler |
withRules(Rules... values) |
ICrawler |
withSessionIdBlacklist(ICrawler.SessionIdBlacklist value) |
ICrawler |
withSimpleSiteCollapsingDepth(java.lang.Long value) |
ICrawler |
withSmartRefresh(java.lang.Boolean value) |
ICrawler |
withSmartRefreshMaxAgeS(java.lang.Integer value) |
ICrawler |
withSmartRefreshMinAgeS(java.lang.Integer value) |
ICrawler |
withStoreTextOnly(java.lang.Boolean value) |
ICrawler |
withThrottleTimeMS(java.lang.Integer value) |
protected java.util.List<Rules> rules
protected CrawlSchedulerConfig crawlSchedulerConfig
protected CustomCrawlConfig customCrawlConfig
protected ICrawler.MimeTypes mimeTypes
protected ICrawler.SessionIdBlacklist sessionIdBlacklist
protected java.lang.String name
protected java.lang.String fetcher
protected java.lang.String crawlerServer
protected java.lang.String buildGroup
protected java.lang.Boolean storeTextOnly
protected java.lang.Integer nthreads
protected java.lang.Boolean aggressive
protected java.lang.Integer throttleTimeMS
protected java.lang.Boolean ignoreRobotsTxt
protected java.lang.Boolean enableConvertProcessor
protected java.lang.Boolean nearDuplicateDetector
protected java.lang.Boolean patternsDetector
protected java.lang.Boolean crawlSitemaps
protected java.lang.Boolean disableConditionalGet
protected java.lang.Boolean defaultAccept
protected java.lang.Boolean defaultIndex
protected java.lang.Boolean defaultFollow
protected java.lang.Boolean defaultFollowRoots
protected java.lang.Boolean enableSimpleSiteCollapsing
protected java.lang.Long simpleSiteCollapsingDepth
protected java.lang.String mimeTypesMode
protected java.lang.Boolean indexRedirectSources
protected java.lang.Boolean smartRefresh
protected java.lang.Integer smartRefreshMinAgeS
protected java.lang.Integer smartRefreshMaxAgeS
public ICrawler()
Creates a new ICrawler instance.
public ICrawler(ICrawler o)
Creates a new ICrawler instance by deeply copying a given ICrawler instance.
Parameters:
o - The instance to copy.
Throws:
java.lang.NullPointerException - if o is null.
public java.util.List<Rules> getRules()
This accessor method returns a reference to the live list,
not a snapshot. Therefore any modification you make to the
returned list will be present inside the JAXB object.
This is why there is not a set
method for the rules property.
For example, to add a new item, do as follows:
getRules().add(newItem);
Objects of the following type(s) are allowed in the list
Rules
public CrawlSchedulerConfig getCrawlSchedulerConfig()
CrawlSchedulerConfig
public void setCrawlSchedulerConfig(CrawlSchedulerConfig value)
value
- allowed object is
CrawlSchedulerConfig
public CustomCrawlConfig getCustomCrawlConfig()
CustomCrawlConfig
public void setCustomCrawlConfig(CustomCrawlConfig value)
value
- allowed object is
CustomCrawlConfig
public ICrawler.MimeTypes getMimeTypes()
ICrawler.MimeTypes
public void setMimeTypes(ICrawler.MimeTypes value)
value
- allowed object is
ICrawler.MimeTypes
public ICrawler.SessionIdBlacklist getSessionIdBlacklist()
ICrawler.SessionIdBlacklist
public void setSessionIdBlacklist(ICrawler.SessionIdBlacklist value)
value
- allowed object is
ICrawler.SessionIdBlacklist
public java.lang.String getName()
String
public void setName(java.lang.String value)
value
- allowed object is
String
public java.lang.String getFetcher()
String
public void setFetcher(java.lang.String value)
value
- allowed object is
String
public java.lang.String getCrawlerServer()
String
public void setCrawlerServer(java.lang.String value)
value
- allowed object is
String
public java.lang.String getBuildGroup()
String
public void setBuildGroup(java.lang.String value)
value
- allowed object is
String
public boolean isStoreTextOnly()
Boolean
public void setStoreTextOnly(java.lang.Boolean value)
value
- allowed object is
Boolean
public int getNthreads()
Integer
public void setNthreads(java.lang.Integer value)
value
- allowed object is
Integer
public boolean isAggressive()
Boolean
public void setAggressive(java.lang.Boolean value)
value
- allowed object is
Boolean
public int getThrottleTimeMS()
Integer
public void setThrottleTimeMS(java.lang.Integer value)
value
- allowed object is
Integer
public boolean isIgnoreRobotsTxt()
Boolean
public void setIgnoreRobotsTxt(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isEnableConvertProcessor()
Boolean
public void setEnableConvertProcessor(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isNearDuplicateDetector()
Boolean
public void setNearDuplicateDetector(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isPatternsDetector()
Boolean
public void setPatternsDetector(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isCrawlSitemaps()
Boolean
public void setCrawlSitemaps(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isDisableConditionalGet()
Boolean
public void setDisableConditionalGet(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isDefaultAccept()
Boolean
public void setDefaultAccept(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isDefaultIndex()
Boolean
public void setDefaultIndex(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isDefaultFollow()
Boolean
public void setDefaultFollow(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isDefaultFollowRoots()
Boolean
public void setDefaultFollowRoots(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isEnableSimpleSiteCollapsing()
Boolean
public void setEnableSimpleSiteCollapsing(java.lang.Boolean value)
value
- allowed object is
Boolean
public long getSimpleSiteCollapsingDepth()
Long
public void setSimpleSiteCollapsingDepth(java.lang.Long value)
value
- allowed object is
Long
public java.lang.String getMimeTypesMode()
String
public void setMimeTypesMode(java.lang.String value)
value
- allowed object is
String
public boolean isIndexRedirectSources()
Boolean
public void setIndexRedirectSources(java.lang.Boolean value)
value
- allowed object is
Boolean
public boolean isSmartRefresh()
Boolean
public void setSmartRefresh(java.lang.Boolean value)
value
- allowed object is
Boolean
public int getSmartRefreshMinAgeS()
Integer
public void setSmartRefreshMinAgeS(java.lang.Integer value)
value
- allowed object is
Integer
public int getSmartRefreshMaxAgeS()
Integer
public void setSmartRefreshMaxAgeS(java.lang.Integer value)
value
- allowed object is
Integer
public ICrawler withCrawlSchedulerConfig(CrawlSchedulerConfig value)
public ICrawler withCustomCrawlConfig(CustomCrawlConfig value)
public ICrawler withMimeTypes(ICrawler.MimeTypes value)
public ICrawler withSessionIdBlacklist(ICrawler.SessionIdBlacklist value)
public ICrawler withName(java.lang.String value)
public ICrawler withFetcher(java.lang.String value)
public ICrawler withCrawlerServer(java.lang.String value)
public ICrawler withBuildGroup(java.lang.String value)
public ICrawler withStoreTextOnly(java.lang.Boolean value)
public ICrawler withNthreads(java.lang.Integer value)
public ICrawler withAggressive(java.lang.Boolean value)
public ICrawler withThrottleTimeMS(java.lang.Integer value)
public ICrawler withIgnoreRobotsTxt(java.lang.Boolean value)
public ICrawler withEnableConvertProcessor(java.lang.Boolean value)
public ICrawler withNearDuplicateDetector(java.lang.Boolean value)
public ICrawler withPatternsDetector(java.lang.Boolean value)
public ICrawler withCrawlSitemaps(java.lang.Boolean value)
public ICrawler withDisableConditionalGet(java.lang.Boolean value)
public ICrawler withDefaultAccept(java.lang.Boolean value)
public ICrawler withDefaultIndex(java.lang.Boolean value)
public ICrawler withDefaultFollow(java.lang.Boolean value)
public ICrawler withDefaultFollowRoots(java.lang.Boolean value)
public ICrawler withEnableSimpleSiteCollapsing(java.lang.Boolean value)
public ICrawler withSimpleSiteCollapsingDepth(java.lang.Long value)
public ICrawler withMimeTypesMode(java.lang.String value)
public ICrawler withIndexRedirectSources(java.lang.Boolean value)
public ICrawler withSmartRefresh(java.lang.Boolean value)
public ICrawler withSmartRefreshMinAgeS(java.lang.Integer value)
public ICrawler withSmartRefreshMaxAgeS(java.lang.Integer value)
public ICrawler clone()
Overrides:
clone in class java.lang.Object
Copyright © 2021 Dassault Systèmes, All Rights Reserved.