#This sample configuration file allows the Index Server Companion to
#retrieve the content from a website as well as from the
#SampleProject.mdb Microsoft Access database.
#Note that the setting name and the setting value MUST be
#separated by a space, an equals sign and another space.
#Each setting and each comment must be on its own line.

##################################################
#                                                #
#   General application configuration options    #
#                                                #
##################################################

#Name of the project
[ProjectName] = SampleProject

#Whether the Indexing Service Companion should create a log file
[CreateLogFile] = yes

#The output folder for the Indexing Service Companion log files
#Logs are saved with the filename _YYYY-MM-DD HH-MM-SS.log
#The log file directory must actually exist, or the log file will not be created
#The log file directory should either be a full path, e.g.
#C:\logs\IndexServerCompanion
#or a relative path from the working directory that the Index Server Companion is
#run from, e.g.
#logs
[LogFileDir] = SampleProject\Logs

#The folder in which the Index Server Companion's Information Store is contained
#The folder name should either be a full path, e.g.
#C:\Programs\IndexServerCompanion\InfoStore
#or a relative path from the working directory that the Index Server Companion is
#run from, e.g.
#InfoStore
[InfoStoreDir] = SampleProject\InformationStore

#The folder in which the Index Server Companion should place the output files
#that are to be indexed by Index Server. Note that the output folder does not
#need to be accessible from the web
#The folder name should either be a full path, e.g.
#D:\Inetpub\IndexServerFiles\mywebsite
#or a relative path from the working directory that the Index Server Companion is
#run from, e.g.
#output
[OutputDir] = SampleProject\Output

#If set to 'yes' then the script will output its status to the command console
[Verbose] = yes

#If a filename is specified using the TableOfContentsPage configuration option
#then a basic table of contents page is produced for the website that is crawled
#by the Index Server Companion. To get the most out of this feature, it is important
#that the website contains accurate HTML title tags.
#(Disabled in this sample; remove the leading '#' to enable it.)
#[TableOfContentsPage] = C:\temp\TableOfContents.html

#This experimental feature extracts content from all of the HTML files retrieved
#by the web crawling process then creates a summary page from the content.
#(Disabled in this sample; remove the leading '#' to enable it.)
#[SummaryPage] = C:\temp\Summary.html

##################################################
#                                                #
# Website data extraction configuration options  #
#                                                #
##################################################

#The URL that the web robot should start from when indexing pages on the web server
#The web robot can normally start from the home page of a site, although it is often
#better to let it start at a page which has the largest possible number of links to
#the rest of the pages within a site. A table of contents page (if present) is often
#a good place to start.
[StartURL] = http://www.aspalliance.com/brettb/Default.asp

#Use a base URL if you only want a particular part of a site to be crawled
[BaseURL] = http://www.aspalliance.com/brettb/

#Web robots sometimes have problems in visiting all of the URLs in a site
#so this configuration option will enable the Index Server Companion to
#explore the site from more than one start URL. There is no limit to the
#number of AdditionalURL configuration options added, but they must reside on
#the same website as specified in the StartURL configuration option.
[AdditionalURL] = http://www.aspalliance.com/brettb/What'sNew.aspx

#The value to be used for the HTTP_USER_AGENT HTTP variable by the
#Index Server Companion. Note that when you are using a web crawler
#on third party websites, it is considered best practice to include
#your email address in the User Agent variable so that the administrator
#of the site can contact you should problems arise.
[UserAgent] = Index Server Companion 1.4 (admin@server.com)

#If set to 'yes' then the web crawler will avoid indexing certain files according to
#instructions found in the site's robots.txt file in accordance with the
#Robots Exclusion Standard specification described at
#http://www.robotstxt.org/wc/norobots.html
#Unless you are crawling your own website, you are strongly advised to leave this
#configuration option set to 'yes'.
[UseRobotsTextFile] = yes

#If set to 'yes' then the web crawler will avoid indexing and/or following
#links from pages according to the contents of a 'robots' meta tag. Unlike
#robots.txt files, the 'robots' meta tag is page specific. The two commands
#understood by the Indexing Service Companion are 'noindex' and 'nofollow'.
[UseRobotsMetaTag] = yes

#The IgnoreURLsString option is useful if you are crawling a site that does not
#have a robots.txt file or you are unable to modify the file. It contains a Perl
#regular expression that if matched in a URL will prevent the Index Server
#Companion from retrieving or following links from that URL.
#The regular expression is not case sensitive.
#(Disabled in this sample; remove the leading '#' to enable it.)
#[IgnoreURLsString] = test|testing

#If set to 'yes' then the web robot will treat URLs with different QueryStrings
#as being distinct URLs. This feature is useful for indexing content from
#custom web applications, but be aware that for normal websites it will lead to
#duplicate entries in the search results.
[UseURLQueryStrings] = no

#If set to 'yes' then the Index Server Companion will purge the details
#of visits to URLs and databases from the information store and delete all of
#the pages stored on disk. As such this can be used to reset the project.
[ResetProject] = no

#If CrawlType is set to 'incremental' then only pages that have changed are saved
#to disk. If set to 'full' then all of the pages are updated, regardless of whether
#the content has changed.
[CrawlType] = incremental

#The number of seconds the web crawler waits between each request to a web server
#If extracting content from a remote site, set this to a large value to avoid
#upsetting the server's administrator, and also to prevent bandwidth problems
[SleepTime] = 4

#Number of seconds before a request to a website should timeout
[TimeOutTime] = 25

#The maximum size of content of a single file retrieved by the web robot (in Kilobytes)
[MaxURLSize] = 1024

#The maximum number of URLs that will be retrieved by the web robot
#It is advisable to set this value so that an unattended web robot will
#not accidentally retrieve a large number of URLs and fill all available
#disk space
[MaxNumberOfURLs] = 4096

#A list of URL extensions that the web crawler should save content from
#each should be separated by a single space. Default options are .htm, .html,
#.asp, .aspx, .jsp, .php and .cfm
#Note that binary files such as PDF documents should be specified under
#the FileExtensions option
[URLExtensions] = .htm .html .asp .aspx .jsp .php .cfm

#A list of file extensions that the web crawler should save content from
#each should be separated by a single space. Default options are .doc and .pdf
#Note that Index Server doesn't index Adobe Acrobat PDF files unless you install
#the IFilter (from the Adobe website)
[FileExtensions] = .doc .pdf .rtf

#If set to yes, then the HTML <title> tag of files saved from the web crawl
#will be modified to the format:
#ISC_URL=<url>\t<existingtitle>
#Where \t is a tab character
#This allows programmatic access to the file's original URL in search results
#returned by the Indexing Service without having to modify Indexing Service
#properties (for which administrative access to the server is required).
[AddURLToTitle] = no

#If you are indexing content from a case sensitive web server then this setting
#must be set to 'yes', otherwise set it to 'no'.
[CaseSensitiveServer] = no

##################################################
#                                                #
# Database data extraction configuration options #
#                                                #
##################################################

#If set to yes, then the HTML <title> tag of all files saved from database rows
#will be modified to the format:
#ISC_Table=<tablename>\tISC_KeyField=<keyfield>\tISC_RowNumber=<rownumber>\t<existingtitle>
#Where \t is a tab character
#This allows programmatic access to the table's name, primary key field
#and row number in search results returned by the Indexing Service without
#having to modify Indexing Service properties (for which administrative
#access to the server is required).
[AddRowToTitle] = no

#If set to 'yes' then all of the pages produced from the database rows will be refreshed
[RefreshAllRows] = no

#The following will extract the data from the q_courses Query of the SampleProject Access 2000 database
#Each database table uses the following mandatory configuration options:
#   DSN = connection string
#   Table = table name
#   KeyField = primary key field of the table
#   AllowFields = table fields to retrieve
#
#Each database table also uses the following optional configuration options:
#
#   TitleField = table field containing the row's title
#   DescriptionField = table field containing the row's description
#
#Each database table must have a unique number in this configuration file
#For further details, consult the Indexing Service Companion's product documentation

#Connection string for the database containing the specific table.
#This sample uses the Microsoft Jet 4.0 OLE DB provider; the Data Source path is
#written relative to the working directory, like the other SampleProject paths
#in this file.
[DataTable_1_DSN] = Provider=Microsoft.Jet.OLEDB.4.0;Data Source=SampleProject\SampleProject.mdb;Persist Security Info=False;

#Name of the database table. Note that it is possible to specify the database name
#and the table name in the format <database>..<table> used by SQL Server and other
#relational databases.
[DataTable_1_Table] = q_courses

#Primary key field for this table. This field must be unique. Composite keys are
#not presently supported
[DataTable_1_KeyField] = CourseID

#A list of database table fields that need to be indexed. A single '*' means that
#all of the fields in the table will be indexed.
[DataTable_1_AllowFields] = *

#An optional configuration option that specifies the field that should be used
#for the row's title. This is used in the HTML <title> tag for the row,
#and is useful for when creating pages displaying search results.
[DataTable_1_TitleField] = CourseName

#An optional configuration option that specifies the field that should be used
#for the row's description. This is used in the HTML <description> tag for the row,
#and is useful for when creating pages displaying search results.
[DataTable_1_DescriptionField] = CourseDescription

#The following will extract the data from the t_tutors table of the SampleProject Access 2000 database
[DataTable_2_DSN] = Provider=Microsoft.Jet.OLEDB.4.0;Data Source=SampleProject\SampleProject.mdb;Persist Security Info=False;
[DataTable_2_Table] = t_tutors
[DataTable_2_KeyField] = TutorID
[DataTable_2_AllowFields] = TutorID, TutorName, TutorSubjects, TutorBiography
[DataTable_2_TitleField] = TutorName
[DataTable_2_DescriptionField] = TutorBiography