// weblech/spider/Spider.java — from a repository archived 2020-05-27 (read-only:
// viewable/clonable, but no pushes, issues, or pull requests).
// Original file: 334 lines, 10 KiB, Java.
/*
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
*
* Copyright (c) 2001 Brian Pitcher
* Copyright (c) 2004 Andrew Coleman
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package weblech.spider;
import weblech.ui.LechLogger;
import java.util.*;
import java.io.*;
import java.net.URL;
/**
 * Multi-threaded web spider: pulls URLs from a shared download queue,
 * fetches them, extracts links from HTML/XML content, and queues any
 * new in-scope URLs. Several threads run {@link #run()} concurrently;
 * shared state is guarded by the locks documented on each field.
 */
public class Spider implements Runnable, Constants
{
    /** Configuration shared by all spider threads. */
    private final SpiderConfig config;

    /**
     * Download queue.
     * Thread safety: synchronize on the queue before accessing it.
     * (Not final: replaced wholesale by {@link #readCheckpoint()}.)
     */
    private DownloadQueue queue;

    /**
     * Set of URLs already downloaded or scheduled, so no URL is fetched twice.
     * Thread safety: synchronize on the set before accessing it.
     */
    private final Set urlsDownloadedOrScheduled;

    /**
     * Set of URLs currently being downloaded by spider threads. Persisted in
     * checkpoints so interrupted downloads can be re-queued on restore.
     * Thread safety: synchronize on the set before accessing it.
     */
    private Set urlsDownloading;

    /**
     * Number of downloads currently in progress.
     * Thread safety: modified only while synchronized on the queue; declared
     * volatile so the unsynchronized reads in run()'s loop condition see a
     * fresh value.
     */
    private volatile int downloadsInProgress;

    /**
     * Whether the spider should quit. Set from a different thread via
     * {@link #stop()}, hence volatile for cross-thread visibility.
     */
    private volatile boolean quit;

    /**
     * Count of running Spider threads. Guarded by this Spider instance
     * (see threadStarted()/threadStopped()).
     */
    private int running;

    /** Time (ms since epoch) we last checkpointed; 0 allows an immediate first checkpoint. */
    private long lastCheckpoint;

    /**
     * Create a spider that will start crawling from the configured start location.
     *
     * @param config crawl configuration (start URL, thread count, depth limit, ...)
     */
    public Spider(SpiderConfig config)
    {
        this.config = config;
        queue = new DownloadQueue(config);
        queue.queueURL(new URLToDownload(config.getStartLocation(), 0));
        urlsDownloadedOrScheduled = new HashSet();
        // Record the start URL as scheduled so pages linking back to it
        // don't cause it to be queued and downloaded a second time.
        urlsDownloadedOrScheduled.add(config.getStartLocation());
        urlsDownloading = new HashSet();
        downloadsInProgress = 0;
        lastCheckpoint = 0;
    }

    /** Spawn the configured number of spider threads and begin crawling. */
    public void start()
    {
        quit = false;
        synchronized(this)
        {
            running = 0;
        }
        for(int i = 0; i < config.getSpiderThreads(); i++)
        {
            LechLogger.info("Starting Spider thread");
            Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
            // Count the thread BEFORE starting it, otherwise a very fast
            // thread could decrement the counter before we increment it.
            threadStarted();
            t.start();
        }
    }

    /** Ask all spider threads to stop after their current download completes. */
    public void stop()
    {
        quit = true;
    }

    /**
     * @return true while at least one spider thread is still running.
     *         (Fixes the previous inverted check, which returned
     *         {@code running == 0} and so reported "running" only when
     *         every thread had stopped.)
     */
    public synchronized boolean isRunning()
    {
        return running != 0;
    }

    /** Record that one spider thread has started. Guarded by this instance. */
    private synchronized void threadStarted()
    {
        running++;
    }

    /** Record that one spider thread has finished. Guarded by this instance. */
    private synchronized void threadStopped()
    {
        running--;
    }

    /**
     * Write a checkpoint if the configured interval has elapsed.
     * Uses a check / lock / re-check pattern on the queue monitor so that
     * only one thread actually writes each checkpoint.
     */
    private void checkpointIfNeeded()
    {
        if(config.getCheckpointInterval() == 0)
        {
            return; // checkpointing disabled
        }
        if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
        {
            synchronized(queue)
            {
                // Re-check under the lock: another thread may have just checkpointed.
                if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
                {
                    writeCheckpoint();
                    lastCheckpoint = System.currentTimeMillis();
                }
            }
        }
    }

    /**
     * Serialize the queue and the set of in-flight URLs to "spider.checkpoint".
     * Called with the queue lock held. The stream is now closed in a finally
     * block so an I/O error mid-write no longer leaks the file handle.
     */
    private void writeCheckpoint()
    {
        LechLogger.debug("writeCheckpoint()");
        ObjectOutputStream oos = null;
        try
        {
            oos = new ObjectOutputStream(new FileOutputStream("spider.checkpoint", false));
            oos.writeObject(queue);
            oos.writeObject(urlsDownloading);
        }
        catch(IOException ioe)
        {
            LechLogger.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
        }
        finally
        {
            if(oos != null)
            {
                try
                {
                    oos.close();
                }
                catch(IOException ignored)
                {
                    // Best-effort close; the warn above already reported any real failure.
                }
            }
        }
    }

    /**
     * Restore spider state from "spider.checkpoint". URLs that were mid-download
     * when the checkpoint was taken are re-queued so they get fetched again.
     */
    public void readCheckpoint()
    {
        ObjectInputStream ois = null;
        try
        {
            ois = new ObjectInputStream(new FileInputStream("spider.checkpoint"));
            queue = (DownloadQueue) ois.readObject();
            urlsDownloading = (Set) ois.readObject();
            // Interrupted downloads must be started over from scratch.
            queue.queueURLs(urlsDownloading);
            urlsDownloading.clear();
        }
        catch(Exception e)
        {
            LechLogger.error("Caught exception reading checkpoint: " + e.getMessage(), e);
        }
        finally
        {
            if(ois != null)
            {
                try
                {
                    ois.close();
                }
                catch(IOException ignored)
                {
                    // Best-effort close.
                }
            }
        }
    }

    /**
     * Main loop for each spider thread: take a URL from the queue, download it,
     * extract and filter its links, and queue any URLs not yet seen.
     */
    public void run()
    {
        HTMLParser htmlParser = new HTMLParser(config);
        URLGetter urlGetter = new URLGetter(config);
        while((queueSize() > 0 || downloadsInProgress > 0) && !quit)
        {
            checkpointIfNeeded();
            if(queueSize() == 0 && downloadsInProgress > 0)
            {
                // Another thread is mid-download and may still queue new URLs;
                // wait briefly before re-checking whether this thread should stop.
                try
                {
                    Thread.sleep(QUEUE_CHECK_INTERVAL);
                }
                catch(InterruptedException ignored)
                {
                    // The loop re-checks the quit flag immediately; nothing else to do.
                }
                continue;
            }
            else if(queueSize() == 0)
            {
                break; // queue drained with nothing in flight: crawling is finished
            }
            URLToDownload nextURL;
            synchronized(queue)
            {
                nextURL = queue.getNextInQueue();
                if(nextURL == null)
                {
                    // Another thread emptied the queue between our size check
                    // and this take; go around the loop again.
                    continue;
                }
                downloadsInProgress++;
            }
            synchronized(urlsDownloading)
            {
                urlsDownloading.add(nextURL);
            }
            int newDepth = nextURL.getDepth() + 1;
            int maxDepth = config.getMaxDepth();
            List newURLs = downloadURL(nextURL, urlGetter, htmlParser);
            // Remove from the in-flight set only AFTER the download completes.
            // Previously the URL was removed before downloadURL() ran, so a
            // checkpoint taken during the download silently dropped it.
            synchronized(urlsDownloading)
            {
                urlsDownloading.remove(nextURL);
            }
            newURLs = filterURLs(newURLs);
            ArrayList u2dsToQueue = new ArrayList();
            for(Iterator i = newURLs.iterator(); i.hasNext(); )
            {
                URL u = (URL) i.next();
                // Queue only URLs not yet seen and within the depth limit
                // (maxDepth == 0 means unlimited depth).
                synchronized(urlsDownloadedOrScheduled)
                {
                    if(!urlsDownloadedOrScheduled.contains(u)
                        && (maxDepth == 0 || newDepth <= maxDepth))
                    {
                        u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));
                        urlsDownloadedOrScheduled.add(u);
                    }
                }
            }
            synchronized(queue)
            {
                queue.queueURLs(u2dsToQueue);
                downloadsInProgress--;
            }
        }
        LechLogger.info("Spider thread stopping [" + config.getStartLocation() + "]");
        threadStopped();
    }

    /**
     * Get the size of the download queue in a thread-safe manner.
     */
    private int queueSize()
    {
        synchronized(queue)
        {
            return queue.size();
        }
    }

    /**
     * Get a URL, and return new URLs that are referenced from it.
     * Objects already on disk are only re-fetched when the configuration
     * requests refreshes for their content type.
     *
     * @return a List of URL objects (never null; empty on failure or
     *         for non-HTML/XML content)
     */
    private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)
    {
        LechLogger.debug("downloadURL(" + url + ")");
        URLObject obj = new URLObject(url.getURL(), config);
        boolean fetch;
        if(obj.existsOnDisk())
        {
            // Bail out early if already on disk, unless a refresh is configured
            // for this content type.
            fetch = (config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))
                 || (config.refreshImages() && obj.isImage());
        }
        else
        {
            fetch = true;
        }
        if(fetch)
        {
            LechLogger.info("Q: [" + queue + "] " + url);
            obj = urlGetter.getURL(url);
        }
        if(obj == null)
        {
            return new ArrayList(); // download failed; nothing to parse
        }
        if(!obj.existsOnDisk())
        {
            obj.writeToFile();
        }
        if(obj.isHTML() || obj.isXML())
        {
            return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());
        }
        else if(obj.isImage())
        {
            return new ArrayList(); // images contain no links
        }
        else
        {
            LechLogger.warn("Unknown content type received: " + obj.getContentType());
            LechLogger.info("URL was " + url);
            return new ArrayList();
        }
    }

    /**
     * Drop URLs that were already downloaded/scheduled, and URLs that do not
     * contain the configured URL-match substring.
     */
    private List filterURLs(List URLs)
    {
        String match = config.getURLMatch();
        ArrayList retVal = new ArrayList();
        synchronized(urlsDownloadedOrScheduled)
        {
            for(Iterator i = URLs.iterator(); i.hasNext(); )
            {
                URL u = (URL) i.next();
                if(urlsDownloadedOrScheduled.contains(u))
                {
                    continue;
                }
                if(u.toString().indexOf(match) != -1)
                {
                    retVal.add(u);
                }
            }
        }
        return retVal;
    }

    /* Method By Coleman
     * A basic check to see if there is another spider downloading the same thing
     */
    protected boolean compareSpiderConfig(SpiderConfig sc)
    {
        return config.getStartLocation().equals(sc.getStartLocation());
    }

    /* Method By Coleman
     * Two spiders are considered equal when they crawl the same start location.
     * Now null-safe (the old o.getClass() call threw NPE on null) and paired
     * with hashCode() to honor the equals/hashCode contract.
     */
    public boolean equals(Object o)
    {
        if(!(o instanceof Spider))
        {
            return false; // also covers o == null
        }
        return ((Spider) o).compareSpiderConfig(config);
    }

    /** Consistent with equals(): hash on the start location only. */
    public int hashCode()
    {
        return config.getStartLocation().hashCode();
    }
}