protected static class WorkerThread.ProcessActivity extends Object implements IProcessActivity
_rcsid
BAD_URL, EXCLUDED_CONTENT, EXCLUDED_DATE, EXCLUDED_LENGTH, EXCLUDED_MIMETYPE, EXCLUDED_URL, NULL_URL
Constructor and Description |
---|
WorkerThread.ProcessActivity(Long jobID,
String processID,
IReprioritizationTracker rt,
IJobManager jobManager,
IIncrementalIngester ingester,
String connectionName,
IPipelineSpecification pipelineSpecification,
Map<String,QueuedDocument> previousDocuments,
long currentTime,
Long expireInterval,
Map<String,Set<String>> forcedMetadata,
Long recrawlInterval,
Long maxInterval,
int hopcountMode,
IRepositoryConnection connection,
IRepositoryConnector connector,
IRepositoryConnectionManager connMgr,
String[] legalLinkTypes,
WorkerThread.OutputActivity ingestLogger,
String parameterVersion)
Constructor.
|
Modifier and Type | Method and Description |
---|---|
void |
addDocumentReference(String localIdentifier)
Add a document description to the current job's queue.
|
void |
addDocumentReference(String localIdentifier,
String parentIdentifier,
String relationshipType)
Add a document description to the current job's queue.
|
void |
addDocumentReference(String localIdentifier,
String parentIdentifier,
String relationshipType,
String[] dataNames,
Object[][] dataValues)
Add a document description to the current job's queue.
|
void |
addDocumentReference(String localIdentifier,
String parentIdentifier,
String relationshipType,
String[] dataNames,
Object[][] dataValues,
Long originationTime)
Add a document description to the current job's queue.
|
void |
addDocumentReference(String localIdentifier,
String parentIdentifier,
String relationshipType,
String[] dataNames,
Object[][] dataValues,
Long originationTime,
String[] prereqEventNames)
Add a document description to the current job's queue.
|
boolean |
beginEventSequence(String eventName)
Begin an event sequence.
|
Long |
calculateDocumentExpireTime(long currentTime,
String localIdentifier) |
Long |
calculateDocumentRescheduleTime(long currentTime,
long timeAmt,
String localIdentifier) |
protected void |
checkAllComponentsMultipleDispositions(String documentIdentifier) |
boolean |
checkDateIndexable(Date date)
Detect if a date is indexable or not.
|
boolean |
checkDocumentIndexable(File localFile)
Check whether a document is indexable by the currently specified output connector.
|
boolean |
checkDocumentNeedsReindexing(String documentIdentifier,
String newVersionString)
Check if a document needs to be reindexed, based on a computed version string.
|
boolean |
checkDocumentNeedsReindexing(String documentIdentifier,
String componentIdentifier,
String newVersionString)
Check if a document needs to be reindexed, based on a computed version string.
|
void |
checkJobStillActive()
Check whether current job is still active.
|
boolean |
checkLengthIndexable(long length)
Check whether a document of a specified length is indexable by the currently specified output connector.
|
boolean |
checkMimeTypeIndexable(String mimeType)
Check whether a mime type is indexable by the currently specified output connector.
|
protected void |
checkMultipleDispositions(String documentIdentifier,
String componentIdentifier,
String componentIdentifierHash) |
boolean |
checkURLIndexable(String url)
Pre-determine whether a document's URL is indexable by this connector.
|
void |
completeEventSequence(String eventName)
Complete an event sequence.
|
protected IPipelineSpecificationWithVersions |
computePipelineSpecificationWithVersions(String documentIdentifierHash,
String componentIdentifierHash,
String documentIdentifier) |
String |
createConnectionSpecificString(String simpleString)
Create a connection-specific string from a simple string.
|
String |
createGlobalString(String simpleString)
Create a global string from a simple string.
|
String |
createJobSpecificString(String simpleString)
Create a job-based string from a simple string.
|
void |
deleteDocument(String documentIdentifier)
Delete the specified document from the search engine index, and from the status table.
|
void |
deleteDocument(String documentIdentifier,
String version)
Deprecated.
|
void |
discard()
Clean up any dangling information, before abandoning this process activity object
|
void |
flush()
Flush the outstanding references into the database.
|
Long |
getDocumentExpirationLowerBoundTime(String localIdentifier)
Find a document's lower expiration time bound, if any
|
Long |
getDocumentExpirationUpperBoundTime(String localIdentifier)
Find a document's upper expiration time bound, if any
|
Long |
getDocumentOriginationTime(String localIdentifier)
Get a document's origination time
|
Long |
getDocumentRescheduleLowerBoundTime(String localIdentifier)
Find a document's lower rescheduling time bound, if any
|
Long |
getDocumentRescheduleUpperBoundTime(String localIdentifier)
Find a document's upper rescheduling time bound, if any
|
void |
ingestDocument(String localIdentifier,
String version,
String documentURI,
RepositoryDocument data)
Deprecated.
|
void |
ingestDocumentWithException(String documentIdentifier,
String version,
String documentURI,
RepositoryDocument data)
Ingest the current document.
|
void |
ingestDocumentWithException(String documentIdentifier,
String componentIdentifier,
String version,
String documentURI,
RepositoryDocument data)
Ingest the current document.
|
void |
noDocument(String documentIdentifier,
String version)
Remove the specified document from the search engine index, while keeping track of the version information
for it (to reduce churn).
|
void |
noDocument(String documentIdentifier,
String componentIdentifier,
String version)
Remove the specified document from the search engine index, and update the
recorded version information for the document.
|
protected void |
processDocumentReferences()
Process outstanding document references, in batch.
|
void |
recordActivity(Long startTime,
String activityType,
Long dataSize,
String entityIdentifier,
String resultCode,
String resultDescription,
String[] childIdentifiers)
Record time-stamped information about the activity of the connector.
|
void |
recordDocument(String documentIdentifier,
String version)
Record a document version, but don't ingest it.
|
void |
recordDocument(String documentIdentifier,
String componentIdentifier,
String version)
Record a document version, WITHOUT reindexing it, or removing it.
|
void |
removeDocument(String documentIdentifier)
Remove the specified document primary component permanently from the search engine index,
and from the status table.
|
void |
resetTimes()
Reset the recorded times
|
void |
retainAllComponentDocument(String documentIdentifier)
Retain all existing document components of a primary document.
|
void |
retainDocument(String documentIdentifier,
String componentIdentifier)
Retain existing document component.
|
String[] |
retrieveParentData(String localIdentifier,
String dataName)
Retrieve data passed from parents to a specified child document.
|
CharacterInput[] |
retrieveParentDataAsFiles(String localIdentifier,
String dataName)
Retrieve data passed from parents to a specified child document.
|
void |
retryDocumentProcessing(String localIdentifier)
Abort processing a document (for sequencing reasons).
|
void |
setDocumentOriginationTime(String localIdentifier,
Long originationTime)
Override a document's origination time.
|
void |
setDocumentScheduleBounds(String localIdentifier,
Long lowerRecrawlBoundTime,
Long upperRecrawlBoundTime,
Long lowerExpireBoundTime,
Long upperExpireBoundTime)
Override the schedule for the next time a document is crawled.
|
protected void |
touchAllComponentsSet(String documentIdentifier) |
protected void |
touchComponentSet(String documentIdentifier,
String componentIdentifierHash) |
boolean |
wasDocumentAborted(String documentIdentifier)
Check whether a document was aborted or not.
|
boolean |
wasDocumentComponentTouched(String documentIdentifier,
String componentIdentifierHash)
Check whether a document component was touched or not.
|
boolean |
wasDocumentDeleted(String documentIdentifier)
Check whether document was deleted or not.
|
boolean |
wasDocumentTouched(String documentIdentifier)
Check whether a document (and its version string) was touched or not.
|
protected final Long jobID
protected final String processID
protected final IJobManager jobManager
protected final IIncrementalIngester ingester
protected final String connectionName
protected final IPipelineSpecification pipelineSpecification
protected final Map<String,QueuedDocument> previousDocuments
protected final long currentTime
protected final Long expireInterval
protected final Long recrawlInterval
protected final Long maxInterval
protected final int hopcountMode
protected final IRepositoryConnection connection
protected final IRepositoryConnector connector
protected final IRepositoryConnectionManager connMgr
protected final String[] legalLinkTypes
protected final WorkerThread.OutputActivity ingestLogger
protected final IReprioritizationTracker rt
protected final String parameterVersion
protected final Map<WorkerThread.DocumentReference,WorkerThread.DocumentReference> referenceList
public WorkerThread.ProcessActivity(Long jobID, String processID, IReprioritizationTracker rt, IJobManager jobManager, IIncrementalIngester ingester, String connectionName, IPipelineSpecification pipelineSpecification, Map<String,QueuedDocument> previousDocuments, long currentTime, Long expireInterval, Map<String,Set<String>> forcedMetadata, Long recrawlInterval, Long maxInterval, int hopcountMode, IRepositoryConnection connection, IRepositoryConnector connector, IRepositoryConnectionManager connMgr, String[] legalLinkTypes, WorkerThread.OutputActivity ingestLogger, String parameterVersion)
jobManager
- is the job manager.
ingester
- is the ingester.
public void discard() throws ManifoldCFException
ManifoldCFException
public boolean wasDocumentTouched(String documentIdentifier)
public boolean wasDocumentComponentTouched(String documentIdentifier, String componentIdentifierHash)
public boolean wasDocumentDeleted(String documentIdentifier)
public boolean wasDocumentAborted(String documentIdentifier)
public boolean checkDocumentNeedsReindexing(String documentIdentifier, String newVersionString) throws ManifoldCFException
checkDocumentNeedsReindexing
in interface IProcessActivity
documentIdentifier
- is the document identifier.
newVersionString
- is the newly-computed version string.
ManifoldCFException
public boolean checkDocumentNeedsReindexing(String documentIdentifier, String componentIdentifier, String newVersionString) throws ManifoldCFException
checkDocumentNeedsReindexing
in interface IProcessActivity
documentIdentifier
- is the document identifier.
componentIdentifier
- is the component document identifier, if any.
newVersionString
- is the newly-computed version string.
ManifoldCFException
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType, String[] dataNames, Object[][] dataValues, Long originationTime, String[] prereqEventNames) throws ManifoldCFException
addDocumentReference
in interface IProcessActivity
localIdentifier
- is the local document identifier to add (for the connector that
fetched the document).
parentIdentifier
- is the document identifier that is considered to be the "parent"
of this identifier. May be null, if no hopcount filtering is desired for this kind of relationship.
relationshipType
- is the string describing the kind of relationship described by this
reference. This must be one of the strings returned by the IRepositoryConnector method
"getRelationshipTypes()". May be null.
dataNames
- is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
dataValues
- are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.
The type of each object must be either a String or a CharacterInput.
originationTime
- is the time, in ms since epoch, that the document originated. Pass null if none or unknown.
prereqEventNames
- are the names of the prerequisite events which this document requires prior to processing. Pass null if none.
ManifoldCFException
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType, String[] dataNames, Object[][] dataValues, Long originationTime) throws ManifoldCFException
addDocumentReference
in interface IProcessActivity
localIdentifier
- is the local document identifier to add (for the connector that
fetched the document).parentIdentifier
- is the document identifier that is considered to be the "parent"
of this identifier. May be null, if no hopcount filtering desired for this kind of relationship.relationshipType
- is the string describing the kind of relationship described by this
reference. This must be one of the strings returned by the IRepositoryConnector method
"getRelationshipTypes()". May be null.dataNames
- is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!dataValues
- are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.originationTime
- is the time, in ms since epoch, that the document originated. Pass null if none or unknown.ManifoldCFException
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType, String[] dataNames, Object[][] dataValues) throws ManifoldCFException
addDocumentReference
in interface IProcessActivity
localIdentifier
- is the local document identifier to add (for the connector that
fetched the document).parentIdentifier
- is the document identifier that is considered to be the "parent"
of this identifier. May be null, if no hopcount filtering desired for this kind of relationship.relationshipType
- is the string describing the kind of relationship described by this
reference. This must be one of the strings returned by the IRepositoryConnector method
"getRelationshipTypes()". May be null.dataNames
- is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!dataValues
- are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.ManifoldCFException
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType) throws ManifoldCFException
addDocumentReference
in interface IProcessActivity
localIdentifier
- is the local document identifier to add (for the connector that
fetched the document).parentIdentifier
- is the document identifier that is considered to be the "parent"
of this identifier. May be null, if no hopcount filtering desired for this kind of relationship.relationshipType
- is the string describing the kind of relationship described by this
reference. This must be one of the strings returned by the IRepositoryConnector method
"getRelationshipTypes()". May be null.ManifoldCFException
public void addDocumentReference(String localIdentifier) throws ManifoldCFException
addDocumentReference
in interface IProcessActivity
localIdentifier
- is the local document identifier to add (for the connector that
fetched the document).ManifoldCFException
public String[] retrieveParentData(String localIdentifier, String dataName) throws ManifoldCFException
retrieveParentData
in interface ICarrydownActivity
localIdentifier
- is the document identifier of the document we want the recorded data for.dataName
- is the name of the data items to retrieve.ManifoldCFException
public CharacterInput[] retrieveParentDataAsFiles(String localIdentifier, String dataName) throws ManifoldCFException
retrieveParentDataAsFiles
in interface ICarrydownActivity
localIdentifier
- is the document identifier of the document we want the recorded data for.dataName
- is the name of the data items to retrieve.ManifoldCFException
public void recordDocument(String documentIdentifier, String version) throws ManifoldCFException
recordDocument
in interface IProcessActivity
documentIdentifier
- is the document identifier.version
- is the document version.ManifoldCFException
public void recordDocument(String documentIdentifier, String componentIdentifier, String version) throws ManifoldCFException
recordDocument
in interface IProcessActivity
documentIdentifier
- is the document identifier.componentIdentifier
- is the component document identifier, if any.version
- is the document version.ManifoldCFException
@Deprecated public void ingestDocument(String localIdentifier, String version, String documentURI, RepositoryDocument data) throws ManifoldCFException, ServiceInterruption
ingestDocument
in interface IProcessActivity
localIdentifier
- is the document's local identifier.version
- is the version of the document, as reported by the getDocumentVersions() method of the
corresponding repository connector.documentURI
- is the URI to use to retrieve this document from the search interface (and is
also the unique key in the index).data
- is the document data. The data is closed after ingestion is complete.
NOTE: Any data stream IOExceptions will be converted to ManifoldCFExceptions and ServiceInterruptions
according to standard best practices.ManifoldCFException
ServiceInterruption
public void ingestDocumentWithException(String documentIdentifier, String version, String documentURI, RepositoryDocument data) throws ManifoldCFException, ServiceInterruption, IOException
ingestDocumentWithException
in interface IProcessActivity
documentIdentifier
- is the document's local identifier.
version
- is the version of the document, as reported by the getDocumentVersions() method of the
corresponding repository connector.
documentURI
- is the URI to use to retrieve this document from the search interface (and is
also the unique key in the index).
data
- is the document data. The data is closed after ingestion is complete.
IOException
- only when data stream reading fails.
ManifoldCFException
ServiceInterruption
public void ingestDocumentWithException(String documentIdentifier, String componentIdentifier, String version, String documentURI, RepositoryDocument data) throws ManifoldCFException, ServiceInterruption, IOException
ingestDocumentWithException
in interface IProcessActivity
documentIdentifier
- is the document's identifier.componentIdentifier
- is the component document identifier, if any.version
- is the version of the document, as reported by the getDocumentVersions() method of the
corresponding repository connector.documentURI
- is the URI to use to retrieve this document from the search interface (and is
also the unique key in the index).data
- is the document data. The data is closed after ingestion is complete.IOException
- only when data stream reading fails.ManifoldCFException
ServiceInterruption
public void noDocument(String documentIdentifier, String version) throws ManifoldCFException, ServiceInterruption
noDocument
in interface IProcessActivity
documentIdentifier
- is the document's local identifier.version
- is the version string to be recorded for the document.ManifoldCFException
ServiceInterruption
public void noDocument(String documentIdentifier, String componentIdentifier, String version) throws ManifoldCFException, ServiceInterruption
noDocument
in interface IProcessActivity
documentIdentifier
- is the document's local identifier.componentIdentifier
- is the component document identifier, if any.version
- is the version string to be recorded for the document.ManifoldCFException
ServiceInterruption
public void removeDocument(String documentIdentifier) throws ManifoldCFException, ServiceInterruption
removeDocument
in interface IProcessActivity
documentIdentifier
- is the document's identifier.ManifoldCFException
ServiceInterruption
public void retainDocument(String documentIdentifier, String componentIdentifier) throws ManifoldCFException
retainDocument
in interface IProcessActivity
documentIdentifier
- is the document's identifier.componentIdentifier
- is the component document identifier, which cannot be null.ManifoldCFException
@Deprecated public void deleteDocument(String documentIdentifier, String version) throws ManifoldCFException, ServiceInterruption
deleteDocument
in interface IProcessActivity
documentIdentifier
- is the document's local identifier.version
- is the version string to be recorded for the document.ManifoldCFException
ServiceInterruption
public void retainAllComponentDocument(String documentIdentifier) throws ManifoldCFException
retainAllComponentDocument
in interface IProcessActivity
documentIdentifier
- is the document's identifier.ManifoldCFException
public void deleteDocument(String documentIdentifier) throws ManifoldCFException
deleteDocument
in interface IProcessActivity
documentIdentifier
- is the document's identifier.ManifoldCFException
public void setDocumentScheduleBounds(String localIdentifier, Long lowerRecrawlBoundTime, Long upperRecrawlBoundTime, Long lowerExpireBoundTime, Long upperExpireBoundTime) throws ManifoldCFException
setDocumentScheduleBounds
in interface IProcessActivity
localIdentifier
- is the document's local identifier.
lowerRecrawlBoundTime
- is the time in ms since epoch that the reschedule time should not fall BELOW, or null if none.
upperRecrawlBoundTime
- is the time in ms since epoch that the reschedule time should not rise ABOVE, or null if none.
lowerExpireBoundTime
- is the time in ms since epoch that the expire time should not fall BELOW, or null if none.
upperExpireBoundTime
- is the time in ms since epoch that the expire time should not rise ABOVE, or null if none.
ManifoldCFException
public void setDocumentOriginationTime(String localIdentifier, Long originationTime) throws ManifoldCFException
setDocumentOriginationTime
in interface IProcessActivity
localIdentifier
- is the document's local identifier.originationTime
- is the document's origination time, or null if unknown.ManifoldCFException
public Long getDocumentRescheduleLowerBoundTime(String localIdentifier)
public Long getDocumentRescheduleUpperBoundTime(String localIdentifier)
public Long getDocumentExpirationLowerBoundTime(String localIdentifier)
public Long getDocumentExpirationUpperBoundTime(String localIdentifier)
public Long getDocumentOriginationTime(String localIdentifier)
public Long calculateDocumentRescheduleTime(long currentTime, long timeAmt, String localIdentifier)
public Long calculateDocumentExpireTime(long currentTime, String localIdentifier)
public void resetTimes()
public void recordActivity(Long startTime, String activityType, Long dataSize, String entityIdentifier, String resultCode, String resultDescription, String[] childIdentifiers) throws ManifoldCFException
recordActivity
in interface IHistoryActivity
startTime
- is either null or the time since the start of epoch in milliseconds (Jan 1, 1970). Every
activity has an associated time; the startTime field records when the activity began. A null value
indicates that the start time and the finishing time are the same.
activityType
- is a string which is fully interpretable only in the context of the connector involved, which is
used to categorize what kind of activity is being recorded. For example, a web connector might record a
"fetch document" activity. Cannot be null.
dataSize
- is the number of bytes of data involved in the activity, or null if not applicable.
entityIdentifier
- is a (possibly long) string which identifies the object involved in the history record.
The interpretation of this field will differ from connector to connector. May be null.
resultCode
- contains a terse description of the result of the activity. The description is limited in
size to 255 characters, and can be interpreted only in the context of the current connector. May be null.
resultDescription
- is a (possibly long) human-readable string which adds detail, if required, to the result
described in the resultCode field. This field is not meant to be queried on. May be null.
childIdentifiers
- is a set of child entity identifiers associated with this activity. May be null.
ManifoldCFException
public void flush() throws ManifoldCFException
ManifoldCFException
protected void processDocumentReferences() throws ManifoldCFException
ManifoldCFException
public void checkJobStillActive() throws ManifoldCFException, ServiceInterruption
checkJobStillActive
in interface IAbortActivity
ManifoldCFException
ServiceInterruption
public boolean beginEventSequence(String eventName) throws ManifoldCFException
beginEventSequence
in interface IEventActivity
eventName
- is the event name.ManifoldCFException
public void completeEventSequence(String eventName) throws ManifoldCFException
completeEventSequence
in interface IEventActivity
eventName
- is the event name.ManifoldCFException
public void retryDocumentProcessing(String localIdentifier) throws ManifoldCFException
retryDocumentProcessing
in interface IEventActivity
localIdentifier
- is the document identifier to requeue.
ManifoldCFException
public boolean checkDateIndexable(Date date) throws ManifoldCFException, ServiceInterruption
checkDateIndexable
in interface IFingerprintActivity
date
- is the date of the document; may be null.
ManifoldCFException
ServiceInterruption
public boolean checkMimeTypeIndexable(String mimeType) throws ManifoldCFException, ServiceInterruption
checkMimeTypeIndexable
in interface IFingerprintActivity
mimeType
- is the mime type to check, not including any character set specification.ManifoldCFException
ServiceInterruption
public boolean checkDocumentIndexable(File localFile) throws ManifoldCFException, ServiceInterruption
checkDocumentIndexable
in interface IFingerprintActivity
localFile
- is the local copy of the file to check.ManifoldCFException
ServiceInterruption
public boolean checkLengthIndexable(long length) throws ManifoldCFException, ServiceInterruption
checkLengthIndexable
in interface IFingerprintActivity
length
- is the length to check.ManifoldCFException
ServiceInterruption
public boolean checkURLIndexable(String url) throws ManifoldCFException, ServiceInterruption
checkURLIndexable
in interface IFingerprintActivity
url
- is the URL of the document.ManifoldCFException
ServiceInterruption
public String createGlobalString(String simpleString)
createGlobalString
in interface INamingActivity
simpleString
- is the simple string.
public String createConnectionSpecificString(String simpleString)
createConnectionSpecificString
in interface INamingActivity
simpleString
- is the simple string.
public String createJobSpecificString(String simpleString)
createJobSpecificString
in interface INamingActivity
simpleString
- is the simple string.
protected void checkAllComponentsMultipleDispositions(String documentIdentifier)
protected void checkMultipleDispositions(String documentIdentifier, String componentIdentifier, String componentIdentifierHash)
protected void touchAllComponentsSet(String documentIdentifier)
protected void touchComponentSet(String documentIdentifier, String componentIdentifierHash)
protected IPipelineSpecificationWithVersions computePipelineSpecificationWithVersions(String documentIdentifierHash, String componentIdentifierHash, String documentIdentifier)