From 80c48012cd2dcaacfa9de8970abd5bc1b4002780 Mon Sep 17 00:00:00 2001 From: mercury Date: Sun, 7 Mar 2004 20:51:05 +0000 Subject: [PATCH] Initial revision --- LICENSE.txt | 26 ++ build.xml | 31 ++ weblech/spider/Constants.java | 18 + weblech/spider/DownloadQueue.java | 143 ++++++++ weblech/spider/DumbAuthenticator.java | 50 +++ weblech/spider/HTMLParser.java | 188 +++++++++++ weblech/spider/Spider.java | 333 ++++++++++++++++++ weblech/spider/SpiderConfig.java | 464 ++++++++++++++++++++++++++ weblech/spider/URLGetter.java | 138 ++++++++ weblech/spider/URLObject.java | 206 ++++++++++++ weblech/spider/URLToDownload.java | 68 ++++ weblech/ui/LechLogger.java | 138 ++++++++ weblech/ui/SpiderConfigPanel.java | 251 ++++++++++++++ weblech/ui/Troll.java | 113 +++++++ 14 files changed, 2167 insertions(+) create mode 100755 LICENSE.txt create mode 100755 build.xml create mode 100755 weblech/spider/Constants.java create mode 100755 weblech/spider/DownloadQueue.java create mode 100755 weblech/spider/DumbAuthenticator.java create mode 100755 weblech/spider/HTMLParser.java create mode 100755 weblech/spider/Spider.java create mode 100755 weblech/spider/SpiderConfig.java create mode 100755 weblech/spider/URLGetter.java create mode 100755 weblech/spider/URLObject.java create mode 100755 weblech/spider/URLToDownload.java create mode 100755 weblech/ui/LechLogger.java create mode 100755 weblech/ui/SpiderConfigPanel.java create mode 100755 weblech/ui/Troll.java diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100755 index 0000000..d0ee58a --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,26 @@ +WebLech license information. 
+============================ + + This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html + + Copyright (c) 2001 Brian Pitcher + Copyright (c) 2004 Andrew Coleman + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + diff --git a/build.xml b/build.xml new file mode 100755 index 0000000..5891420 --- /dev/null +++ b/build.xml @@ -0,0 +1,31 @@ + + + WebLech - a tool for downloading the web + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/weblech/spider/Constants.java b/weblech/spider/Constants.java new file mode 100755 index 0000000..b90b6b7 --- /dev/null +++ b/weblech/spider/Constants.java @@ -0,0 +1,18 @@ +/* + * Created by IntelliJ IDEA. + * User: Michael Mason + * Date: Jun 5, 2002 + * Time: 6:43:04 PM + * To change template for new interface use + * Code Style | Class Templates options (Tools | IDE Options). 
/**
 * Timing constants shared by the spider classes. Classes pick these up by
 * implementing this interface (Spider does so).
 */
public interface Constants
{
    /**
     * How often to check the queue status. Spider passes this value to
     * Thread.sleep(), so it is a duration in milliseconds.
     */
    public static final int QUEUE_CHECK_INTERVAL = 500;

    /**
     * How long to pause for threads to finish before exiting.
     * Presumably milliseconds as well -- confirm against the code that uses it.
     */
    public static final int SPIDER_STOP_PAUSE = 500;
}
+ */ + +// $Header: /home/andrew/Projects/penguincoder/cvs/WebLech/weblech/spider/DownloadQueue.java,v 1.1 2004/03/07 20:51:05 mercury Exp $ + +package weblech.spider; + +import java.util.*; +import java.net.URL; +import java.io.Serializable; + +public class DownloadQueue implements Serializable +{ + private SpiderConfig config; + + private List interestingURLsToDownload; + private List averageURLsToDownload; + private List boringURLsToDownload; + private Set urlsInQueue; + + public DownloadQueue(SpiderConfig config) + { + this.config = config; + interestingURLsToDownload = new ArrayList(); + averageURLsToDownload = new ArrayList(); + boringURLsToDownload = new ArrayList(); + urlsInQueue = new HashSet(); + } + + public void queueURL(URLToDownload url) + { + URL u = url.getURL(); + if(urlsInQueue.contains(u)) + { + return; + } + + if(config.isInteresting(u)) + { + if(config.isDepthFirstSearch()) + { + interestingURLsToDownload.add(0, url); + } + else + { + interestingURLsToDownload.add(url); + } + } + else if(config.isBoring(u)) + { + if(config.isDepthFirstSearch()) + { + boringURLsToDownload.add(0, url); + } + else + { + boringURLsToDownload.add(url); + } + } + else + { + if(config.isDepthFirstSearch()) + { + averageURLsToDownload.add(0, url); + } + else + { + averageURLsToDownload.add(url); + } + } + + urlsInQueue.add(u); + } + + public void queueURLs(Collection urls) + { + for(Iterator i = urls.iterator(); i.hasNext(); ) + { + URLToDownload u2d = (URLToDownload) i.next(); + queueURL(u2d); + } + } + + public URLToDownload getNextInQueue() + { + if(interestingURLsToDownload.size() > 0) + { + return returnURLFrom(interestingURLsToDownload); + } + else if(averageURLsToDownload.size() > 0) + { + return returnURLFrom(averageURLsToDownload); + } + else if(boringURLsToDownload.size() > 0) + { + return returnURLFrom(boringURLsToDownload); + } + else + { + return null; + } + } + + private URLToDownload returnURLFrom(List urlList) + { + URLToDownload u2d = (URLToDownload) 
urlList.get(0); + urlList.remove(0); + urlsInQueue.remove(u2d.getURL()); + return u2d; + } + + public int size() + { + return interestingURLsToDownload.size() + averageURLsToDownload.size() + boringURLsToDownload.size(); + } + + public String toString() + { + return size() + " URLs"; + } + +} // End class DownloadQueue diff --git a/weblech/spider/DumbAuthenticator.java b/weblech/spider/DumbAuthenticator.java new file mode 100755 index 0000000..5760306 --- /dev/null +++ b/weblech/spider/DumbAuthenticator.java @@ -0,0 +1,50 @@ +/* + * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html + * + * Copyright (c) 2001 Brian Pitcher + * Copyright (c) 2004 Andrew Coleman + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package weblech.spider; + +import java.net.Authenticator; +import java.net.PasswordAuthentication; + +import weblech.ui.LechLogger; + +public class DumbAuthenticator extends Authenticator +{ + private final String user; + private final String password; + + public DumbAuthenticator(String user, String password) + { + LechLogger.debug("DumbAuthenticator(" + user + ", ***)"); + this.user = user; + this.password = password; + } + + public PasswordAuthentication getPasswordAuthentication() + { + LechLogger.debug("getPasswordAuthentication()"); + return new PasswordAuthentication(user, password.toCharArray()); + } +} diff --git a/weblech/spider/HTMLParser.java b/weblech/spider/HTMLParser.java new file mode 100755 index 0000000..865fba0 --- /dev/null +++ b/weblech/spider/HTMLParser.java @@ -0,0 +1,188 @@ +/* + * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html + * + * Copyright (c) 2001 Brian Pitcher + * Copyright (c) 2004 Andrew Coleman + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package weblech.spider; + +import java.util.List; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; +import java.net.URL; +import java.net.MalformedURLException; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.FileWriter; +import java.io.PrintWriter; + +import weblech.ui.LechLogger; + +public class HTMLParser +{ + private SpiderConfig config; + + public HTMLParser(SpiderConfig config) + { + this.config = config; + } + + public List parseLinksInDocument(URL sourceURL, String textContent) + { + return parseAsHTML(sourceURL, textContent); + } + + private List parseAsHTML(URL sourceURL, String textContent) + { + LechLogger.debug("parseAsHTML()"); + ArrayList newURLs = new ArrayList(); + HashSet newURLSet = new HashSet(); + + /* note from coleman: + * I had to add a few tags into this, namely the link and embeds. weblech should download flash + * movies, mpegs, avis, and anything else that it finds on the page. 
even stylesheets :) + */ + extractAttributesFromTags("img", "src", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("a", "href", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("body", "background", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("frame", "src", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("link", "href", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("embed", "src", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("A", "HREF", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("BODY", "BACKGROUND", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("LINK", "HREF", sourceURL, newURLs, newURLSet, textContent); + extractAttributesFromTags("EMBED", "SRC", sourceURL, newURLs, newURLSet, textContent); + + if(newURLs.size() == 0) + { + LechLogger.debug("Got 0 new URLs from HTML parse, check HTML\n" + textContent); + } + LechLogger.debug("Returning " + newURLs.size() + " urls extracted from page"); + return newURLs; + } + + private void extractAttributesFromTags(String tag, String attr, URL sourceURL, List newURLs, Set newURLSet, String input) + { + LechLogger.debug("extractAttributesFromTags(" + tag + ", " + attr + ", ...)"); + + int startPos = 0; + String startTag = "<" + tag + " "; + String attrStr = attr + "=\""; + while(true) + { + int tagPos = input.indexOf(startTag, startPos); + if(tagPos < 0) + { + return; + } + int attrPos = input.indexOf(attrStr, tagPos + 1); + if(attrPos < 0) + { + startPos = tagPos + 1; + continue; + } + int nextClosePos = input.indexOf(">", tagPos + 1); + if(attrPos < nextClosePos) + { + // Ooh, found one + int closeQuotePos = input.indexOf("\"", attrPos + 
attrStr.length() + 1); + if(closeQuotePos > 0) + { + String urlStr = input.substring(attrPos + attrStr.length(), closeQuotePos); + if(urlStr.indexOf('#') != -1) + { + urlStr = urlStr.substring(0, urlStr.indexOf('#')); + } + //LechLogger.debug("Found possible URL string: " + URL); + + if(isMailTo(urlStr)) + { + logMailURL(urlStr); + } + else + { + try + { + + URL u = new URL(sourceURL, urlStr); + if(newURLSet.contains(u)) + { + //LechLogger.debug("Already found URL on page: " + u); + } + else + { + newURLs.add(u); + newURLSet.add(u); + //LechLogger.debug("Found new URL on page: " + u); + } + } + catch(MalformedURLException murle) + { + } + } + } + startPos = tagPos + 1; + continue; + } + else + { + startPos = tagPos + 1; + continue; + } + } + } + + private void logMailURL(String url) + { + LechLogger.debug("logMailURL()"); + + try + { + FileWriter appendedFile = new FileWriter(config.getMailtoLogFile().toString(), true); + PrintWriter pW = new PrintWriter(appendedFile); + pW.println(url); + pW.flush(); + pW.close(); + } + catch(IOException ioe) + { + LechLogger.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe); + } + } + + /** + * Check if a particular URL looks like it's a mailto: style link. 
+ */ + private boolean isMailTo(String url) + { + if(url == null) + { + return false; + } + + url = url.toUpperCase(); + return (url.indexOf("MAILTO:") != -1); + } +} diff --git a/weblech/spider/Spider.java b/weblech/spider/Spider.java new file mode 100755 index 0000000..0eff69a --- /dev/null +++ b/weblech/spider/Spider.java @@ -0,0 +1,333 @@ +/* + * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html + * + * Copyright (c) 2001 Brian Pitcher + * Copyright (c) 2004 Andrew Coleman + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package weblech.spider; + +import weblech.ui.LechLogger; + +import java.util.*; +import java.io.*; +import java.net.URL; + +public class Spider implements Runnable, Constants +{ + /** Config for the spider */ + private SpiderConfig config; + /** + * Download queue. + * Thread safety: To access the queue, first synchronize on it. 
+ */ + private DownloadQueue queue; + /** + * Set of URLs downloaded or scheduled, so we don't download a + * URL more than once. + * Thread safety: To access the set, first synchronize on it. + */ + private Set urlsDownloadedOrScheduled; + /** + * Set of URLs currently being downloaded by Spider threads. + * Thread safety: To access the set, first synchronize on it. + */ + private Set urlsDownloading; + /** + * Number of downloads currently taking place. + * Thread safety: To modify this value, first synchronize on + * the download queue. + */ + private int downloadsInProgress; + /** Whether the spider should quit */ + private boolean quit; + /** Count of running Spider threads. */ + private int running; + /** Time we last checkpointed. */ + private long lastCheckpoint; + + public Spider(SpiderConfig config) + { + this.config = config; + queue = new DownloadQueue(config); + queue.queueURL(new URLToDownload(config.getStartLocation(), 0)); + urlsDownloadedOrScheduled = new HashSet(); + urlsDownloading = new HashSet(); + downloadsInProgress = 0; + lastCheckpoint = 0; + } + + public void start() + { + quit = false; + running = 0; + + for(int i = 0; i < config.getSpiderThreads(); i++) + { + LechLogger.info("Starting Spider thread"); + Thread t = new Thread(this, "Spider-Thread-" + (i + 1)); + t.start(); + running++; + } + } + + public void stop() + { + quit = true; + } + + public boolean isRunning() + { + return running == 0; + } + + private void checkpointIfNeeded() + { + if(config.getCheckpointInterval() == 0) + { + return; + } + + if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval()) + { + synchronized(queue) + { + if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval()) + { + writeCheckpoint(); + lastCheckpoint = System.currentTimeMillis(); + } + } + } + } + + private void writeCheckpoint() + { + LechLogger.debug("writeCheckpoint()"); + try + { + FileOutputStream fos = new FileOutputStream("spider.checkpoint", 
false); + ObjectOutputStream oos = new ObjectOutputStream(fos); + oos.writeObject(queue); + oos.writeObject(urlsDownloading); + oos.close(); + } + catch(IOException ioe) + { + LechLogger.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe); + } + } + + public void readCheckpoint() + { + try + { + FileInputStream fis = new FileInputStream("spider.checkpoint"); + ObjectInputStream ois = new ObjectInputStream(fis); + queue = (DownloadQueue) ois.readObject(); + urlsDownloading = (Set) ois.readObject(); + queue.queueURLs(urlsDownloading); + urlsDownloading.clear(); + } + catch(Exception e) + { + LechLogger.error("Caught exception reading checkpoint: " + e.getMessage(), e); + } + } + + public void run() + { + HTMLParser htmlParser = new HTMLParser(config); + URLGetter urlGetter = new URLGetter(config); + + while((queueSize() > 0 || downloadsInProgress > 0) && quit == false) + { + checkpointIfNeeded(); + if(queueSize() == 0 && downloadsInProgress > 0) + { + // Wait for a download to finish before seeing if this thread should stop + try + { + Thread.sleep(QUEUE_CHECK_INTERVAL); + } + catch(InterruptedException ignored) + { + } + // Have another go at the loop + continue; + } + else if(queueSize() == 0) + { + break; + } + URLToDownload nextURL; + synchronized(queue) + { + nextURL = queue.getNextInQueue(); + downloadsInProgress++; + } + synchronized(urlsDownloading) + { + urlsDownloading.add(nextURL); + } + int newDepth = nextURL.getDepth() + 1; + int maxDepth = config.getMaxDepth(); + synchronized(urlsDownloading) + { + urlsDownloading.remove(nextURL); + } + List newURLs = downloadURL(nextURL, urlGetter, htmlParser); + + newURLs = filterURLs(newURLs); + + ArrayList u2dsToQueue = new ArrayList(); + for(Iterator i = newURLs.iterator(); i.hasNext(); ) + { + URL u = (URL) i.next(); + // Download if not yet downloaded, and the new depth is less than the maximum + synchronized(urlsDownloadedOrScheduled) + { + if(!urlsDownloadedOrScheduled.contains(u) + && 
(maxDepth == 0 || newDepth <= maxDepth)) + { + u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth)); + urlsDownloadedOrScheduled.add(u); + } + } + } + synchronized(queue) + { + queue.queueURLs(u2dsToQueue); + downloadsInProgress--; + } + } + LechLogger.info("Spider thread stopping [" + config.getStartLocation() + "]" ); + running--; + } + + /** + * Get the size of the download queue in a thread-safe manner. + */ + private int queueSize() + { + synchronized(queue) + { + return queue.size(); + } + } + + /** + * Get a URL, and return new URLs that are referenced from it. + * + * @return A List of URL objects. + */ + private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser) + { + LechLogger.debug("downloadURL(" + url + ")"); + + // Bail out early if image and already on disk + URLObject obj = new URLObject(url.getURL(), config); + if(obj.existsOnDisk()) + { + if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML())) + { + LechLogger.info("Q: [" + queue + "] " + url); + obj = urlGetter.getURL(url); + } + else if(config.refreshImages() && obj.isImage()) + { + LechLogger.info("Q: [" + queue + "] " + url); + obj = urlGetter.getURL(url); + } + } + else + { + LechLogger.info("Q: [" + queue + "] " + url); + obj = urlGetter.getURL(url); + } + + if(obj == null) + { + return new ArrayList(); + } + + if(!obj.existsOnDisk()) + { + obj.writeToFile(); + } + + if(obj.isHTML() || obj.isXML()) + { + return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent()); + } + else if(obj.isImage()) + { + return new ArrayList(); + } + else + { + LechLogger.warn("Unknown content type received: " + obj.getContentType()); + LechLogger.info("URL was " + url); + return new ArrayList(); + } + } + + private List filterURLs(List URLs) + { + String match = config.getURLMatch(); + ArrayList retVal = new ArrayList(); + + synchronized(urlsDownloadedOrScheduled) + { + for(Iterator i = URLs.iterator(); i.hasNext(); ) + { + URL u = (URL) i.next(); + 
if(urlsDownloadedOrScheduled.contains(u)) + { + continue; + } + + String s = u.toString(); + if(s.indexOf(match) != -1) + { + retVal.add(u); + } + } + } + return retVal; + } + + /* Method By Coleman + * A basic check to see if there is another spider downloading the same thing + */ + protected boolean compareSpiderConfig ( SpiderConfig sc ) { + return config.getStartLocation().equals ( sc.getStartLocation() ); + } + + /* Method By Coleman + * A method to determine if one spider is downloading the same file as another spider + */ + public boolean equals ( Object o ) { + if ( !o.getClass().isInstance ( this ) ) return false; + return ((Spider) o).compareSpiderConfig ( config ); + } + +} diff --git a/weblech/spider/SpiderConfig.java b/weblech/spider/SpiderConfig.java new file mode 100755 index 0000000..570212c --- /dev/null +++ b/weblech/spider/SpiderConfig.java @@ -0,0 +1,464 @@ +/* + * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html + * + * Copyright (c) 2001 Brian Pitcher + * Copyright (c) 2004 Andrew Coleman + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package weblech.spider; + +import weblech.ui.LechLogger; + +import java.io.File; +import java.io.Serializable; +import java.util.*; +import java.net.URL; +import java.net.MalformedURLException; + +public class SpiderConfig implements Serializable +{ + private File saveRootDirectory; + private File mailtoLogFile; + + private boolean refreshHTMLs; + private boolean refreshImages; + private boolean refreshOthers; + + private Set htmlExtensions; + private Set imageExtensions; + + private URL startLocation; + private String urlMatch; + + private List interestingURLSubstrings; + private List boringURLSubstrings; + + private boolean depthFirst; + private int maxDepth; + + private String userAgent; + + private String basicAuthUser; + private String basicAuthPassword; + + private int spiderThreads; + + private long checkpointInterval; + + /** + * Create a default config. 
+ */ + public SpiderConfig() + { + LechLogger.debug("SpiderConfig()"); + + saveRootDirectory = new File("."); + mailtoLogFile = new File("mailto.txt"); + + refreshHTMLs = true; + refreshImages = false; + refreshOthers = false; + + htmlExtensions = new HashSet(); + htmlExtensions.add("htm"); + htmlExtensions.add("html"); + htmlExtensions.add("shtml"); + + imageExtensions = new HashSet(); + imageExtensions.add("jpg"); + imageExtensions.add("gif"); + imageExtensions.add("png"); + /* Added a few image extensions -- Coleman */ + imageExtensions.add("tiff"); + imageExtensions.add("bmp"); + + urlMatch = null; + interestingURLSubstrings = new ArrayList(); + boringURLSubstrings = new ArrayList(); + depthFirst = false; + maxDepth = 0; + + userAgent = "WebLech Spider [Release C]"; + basicAuthUser = ""; + basicAuthPassword = ""; + + spiderThreads = 1; + + checkpointInterval = 0; + } + + /** + * Create a config from a java.util.Properties object. + */ + public SpiderConfig(Properties props) + { + LechLogger.debug("SpiderConfig(props)"); + + saveRootDirectory = new File(props.getProperty("saveRootDirectory", ".")); + if(!saveRootDirectory.exists()) + { + if(!saveRootDirectory.mkdirs()) + { + LechLogger.error("Couldn't create root directory: " + saveRootDirectory); + LechLogger.info("Defaulting to . instead"); + saveRootDirectory = new File("."); + } + } + else if(!saveRootDirectory.isDirectory()) + { + LechLogger.error("Save root is not a directory: " + saveRootDirectory); + LechLogger.info("Defaulting to . 
instead"); + saveRootDirectory = new File("."); + } + + String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt"); + // Check if absolute or relative name given + if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\")) + { + LechLogger.debug("Using absolute file name " + mailtoFileStr); + mailtoLogFile = new File(mailtoFileStr); + } + else + { + LechLogger.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr); + mailtoLogFile = new File(saveRootDirectory.getPath() + "/" + mailtoFileStr); + } + + refreshHTMLs = Boolean.valueOf(props.getProperty("refreshHTMLs", "true")).booleanValue(); + refreshImages = Boolean.valueOf(props.getProperty("refreshImages", "false")).booleanValue(); + refreshOthers = Boolean.valueOf(props.getProperty("refreshOthers", "false")).booleanValue(); + + htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml")); + imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png")); + + String startLocStr = props.getProperty("startLocation"); + if(startLocStr != null) + { + try + { + startLocation = new URL(startLocStr); + } + catch(MalformedURLException murle) + { + LechLogger.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle); + } + } + else + { + LechLogger.warn("startLocation not found in properties"); + } + + urlMatch = props.getProperty("urlMatch"); + + interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs")); + boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs")); + + depthFirst = Boolean.valueOf(props.getProperty("depthFirst", "false")).booleanValue(); + try + { + String maxDepthStr = props.getProperty("maxDepth", "0"); + maxDepth = Integer.parseInt(maxDepthStr); + } + catch(NumberFormatException nfe) + { + LechLogger.error("Caught number format exception parsing max depth, defaulting to 
1", nfe);
+            maxDepth = 1;
+        }
+
+        userAgent = props.getProperty("userAgent", "WebLech Spider [Version C]");
+        basicAuthUser = props.getProperty("basicAuthUser", "");
+        basicAuthPassword = props.getProperty("basicAuthPassword", "");
+
+        try
+        {
+            String threadsStr = props.getProperty("spiderThreads", "1");
+            spiderThreads = Integer.parseInt(threadsStr);
+        }
+        catch(NumberFormatException nfe)
+        {
+            LechLogger.error("Caught number format exception parsing number of threads, defaulting to 1", nfe);
+            spiderThreads = 1;
+        }
+
+        try
+        {
+            String intervalStr = props.getProperty("checkpointInterval", "0");
+            checkpointInterval = Long.parseLong(intervalStr);
+        }
+        catch(NumberFormatException nfe)
+        {
+            LechLogger.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
+            // Apply the default announced in the log message. This used to
+            // reset spiderThreads instead, discarding a valid thread count.
+            checkpointInterval = 0;
+        }
+    }
+
+    /**
+     * Splits a comma-separated property value into a list of its tokens.
+     * Tokens are stored exactly as written (they are not trimmed); a null
+     * or empty string yields an empty list.
+     */
+    private List parsePropCommaSeparated(String str)
+    {
+        ArrayList result = new ArrayList();
+        if(str != null && str.length() > 0)
+        {
+            StringTokenizer tok = new StringTokenizer(str, ",");
+            while(tok.hasMoreTokens())
+            {
+                result.add(tok.nextToken());
+            }
+        }
+        return result;
+    }
+
+
+    /** Sets the refreshHTMLs flag. */
+    public void setRefreshHTMLs(boolean refreshHTMLs)
+    {
+        this.refreshHTMLs = refreshHTMLs;
+    }
+
+    /** Returns the refreshHTMLs flag. */
+    public boolean refreshHTMLs()
+    {
+        return refreshHTMLs;
+    }
+
+    /** Sets the refreshImages flag. */
+    public void setRefreshImages(boolean refreshImages)
+    {
+        this.refreshImages = refreshImages;
+    }
+
+    /** Returns the refreshImages flag. */
+    public boolean refreshImages()
+    {
+        return refreshImages;
+    }
+
+    /** Sets the refreshOthers flag. */
+    public void setRefreshOthers(boolean refreshOthers)
+    {
+        this.refreshOthers = refreshOthers;
+    }
+
+    /** Returns the refreshOthers flag. */
+    public boolean refreshOthers()
+    {
+        return refreshOthers;
+    }
+
+    /** Sets the directory under which downloaded files are saved. */
+    public void setSaveRootDirectory(File saveRootDirectory)
+    {
+        this.saveRootDirectory = saveRootDirectory;
+    }
+
+    /** Returns the directory under which downloaded files are saved. */
+    public File getSaveRootDirectory()
+    {
+        return saveRootDirectory;
+    }
+
+    /** Sets the mailto log file. */
+    public void setMailtoLogFile(File mailtoLogFile)
+    {
+        this.mailtoLogFile = mailtoLogFile;
+    }
+
+    /** Returns the mailto log file. */
+    public File getMailtoLogFile()
+    {
+        return mailtoLogFile;
+    }
+
+    /** Sets the URL the spider starts from. */
+    public void setStartLocation(URL startLocation)
+    {
+        this.startLocation = startLocation;
+    }
+
+    /** Returns the URL the spider starts from. */
+    public URL getStartLocation()
+    {
+        return startLocation;
+    }
+
+    /** Sets the URL match string. */
+    public void setURLMatch(String urlMatch)
+    {
+        this.urlMatch = urlMatch;
+    }
+
+    /** Returns the URL match string. */
+    public String getURLMatch()
+    {
+        return urlMatch;
+    }
+
+    /** Returns the list of substrings that mark a URL as interesting. */
+    public List getInterestingURLSubstrings()
+    {
+        return interestingURLSubstrings;
+    }
+
+    /** Sets the list of substrings that mark a URL as interesting. */
+    public void setInterestingURLSubstrings(List interestingURLSubstrings)
+    {
+        this.interestingURLSubstrings = interestingURLSubstrings;
+    }
+
+    /** Returns the list of substrings that mark a URL as boring. */
+    public List getBoringURLSubstrings()
+    {
+        return boringURLSubstrings;
+    }
+
+    /** Sets the list of substrings that mark a URL as boring. */
+    public void setBoringURLSubstrings(List boringURLSubstrings)
+    {
+        this.boringURLSubstrings = boringURLSubstrings;
+    }
+
+    /** Returns true if the URL contains any of the "interesting" substrings. */
+    public boolean isInteresting(URL u)
+    {
+        return matchURL(u, interestingURLSubstrings);
+    }
+
+    /** Returns true if the URL contains any of the "boring" substrings. */
+    public boolean isBoring(URL u)
+    {
+        return matchURL(u, boringURLSubstrings);
+    }
+
+    /**
+     * Returns true if the external form of the URL contains at least one of
+     * the given substrings. A null list matches nothing.
+     */
+    private boolean matchURL(URL u, List substrings)
+    {
+        if(substrings == null)
+        {
+            return false;
+        }
+        String str = u.toExternalForm();
+        for(Iterator i = substrings.iterator(); i.hasNext(); )
+        {
+            String substr = (String) i.next();
+            if(str.indexOf(substr) != -1)
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** Sets the depth-first search flag. */
+    public void setDepthFirstSearch(boolean depthFirst)
+    {
+        this.depthFirst = depthFirst;
+    }
+
+    /** Returns the depth-first search flag. */
+    public boolean isDepthFirstSearch()
+    {
+        return depthFirst;
+    }
+
+    /** Sets the maximum depth. */
+    public void setMaxDepth(int maxDepth)
+    {
+        this.maxDepth = maxDepth;
+    }
+
+    /** Returns the maximum depth. */
+    public int getMaxDepth()
+    {
+        return maxDepth;
+    }
+
+    /** Sets the User-Agent string. */
+    public void setUserAgent(String userAgent)
+    {
+        this.userAgent = userAgent;
+    }
+
+    /** Returns the User-Agent string. */
+    public String getUserAgent()
+    {
+        return userAgent;
+    }
+
+    /** Sets the basic-auth user name. */
+    public void setBasicAuthUser(String basicAuthUser)
+    {
+        this.basicAuthUser = basicAuthUser;
+    }
+
+    /** Returns the basic-auth user name. */
+    public String getBasicAuthUser()
+    {
+        return basicAuthUser;
+    }
+
+    /** Sets the basic-auth password. */
+    public void setBasicAuthPassword(String basicAuthPassword)
+    {
+        this.basicAuthPassword = basicAuthPassword;
+    }
+
+    /** Returns the basic-auth password. */
+    public String getBasicAuthPassword()
+    {
+        return basicAuthPassword;
+    }
+
+    /** Sets the number of spider threads. */
+    public void setSpiderThreads(int spiderThreads)
+    {
+        this.spiderThreads = spiderThreads;
+    }
+
+    /** Returns the number of spider threads. */
+    public int getSpiderThreads()
+    {
+        return spiderThreads;
+    }
+
+    /** Sets the checkpoint interval. */
+    public void setCheckpointInterval(long interval)
+    {
+        this.checkpointInterval = interval;
+    }
+
+    /** Returns the checkpoint interval. */
+    public long getCheckpointInterval()
+    {
+        return checkpointInterval;
+    }
+
+    /**
+     * Returns a multi-line "name:<tab>value" dump of the configuration.
+     * The basic-auth password is always printed as "***".
+     */
+    public String toString()
+    {
+        return "depthFirst:\t" + depthFirst
+               + "\nmaxDepth:\t" + maxDepth
+               + "\nhtmlExtensions:\t" + fromSet(htmlExtensions)
+               + "\nimageExtensions:\t" + fromSet(imageExtensions)
+               + "\nrefreshHTMLs:\t" + refreshHTMLs
+               + "\nrefreshImages:\t" + refreshImages
+               + "\nrefreshOthers:\t" + refreshOthers
+               + "\nsaveRootDirectory:\t" + saveRootDirectory
+               + "\nstartLocation:\t" + startLocation
+               + "\nurlMatch:\t" + urlMatch
+               + "\nuserAgent:\t" + userAgent
+               + "\nbasicAuthUser:\t" + basicAuthUser
+               + "\nbasicAuthPassword:\t" + "***"
+               + "\nspiderThreads:\t" + spiderThreads
+               + "\ncheckpointInterval:\t" + checkpointInterval;
+    }
+
+    /**
+     * Parses a comma-separated string into a set of trimmed tokens.
+     * A null string yields an empty set.
+     */
+    private Set parseSet(String str)
+    {
+        LechLogger.debug("parseSet(" + str + ")");
+        HashSet result = new HashSet();
+        if(str == null)
+        {
+            return result;
+        }
+        StringTokenizer sTok = new StringTokenizer(str, ",");
+        while(sTok.hasMoreTokens())
+        {
+            String tok = sTok.nextToken().trim();
+            result.add(tok);
+        }
+        return result;
+    }
+
+    /**
+     * Joins the (String) elements of a set with commas, in the set's
+     * iteration order. A null set yields an empty string.
+     */
+    private String fromSet(Set s)
+    {
+        StringBuffer sb = new StringBuffer();
+        if(s == null)
+        {
+            return sb.toString();
+        }
+        boolean first = true;
+        for(Iterator i = s.iterator(); i.hasNext(); )
+        {
+            String str = (String) i.next();
+            if(first)
+            {
+                first = false;
+            }
+            else
+            {
+                sb.append(",");
+            }
+            sb.append(str);
+        }
+        return sb.toString();
+    }
+
+} // End class SpiderConfig
diff --git a/weblech/spider/URLGetter.java b/weblech/spider/URLGetter.java
new file mode 100755
index 0000000..20fcfdd
--- /dev/null
+++ b/weblech/spider/URLGetter.java
@@ -0,0 +1,138 @@
+/*
+ * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
+ *
+ * Copyright (c) 2001 Brian Pitcher
+ * Copyright (c) 2004 Andrew Coleman
+ *
+ * Permission is
hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+package weblech.spider;
+
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.Authenticator;
+import java.io.*;
+
+import weblech.ui.LechLogger;
+
+/**
+ * Downloads the content of a single URL over HTTP and wraps it in a
+ * URLObject. Every failure is logged and reported to the caller as a
+ * null return value rather than as an exception.
+ */
+public class URLGetter
+{
+    /** Number of counted failures since the last back-off pause. */
+    private int failureCount = 0;
+
+    private final SpiderConfig config;
+
+    /**
+     * Creates a getter for the given configuration. Note that this installs
+     * a process-wide default Authenticator built from the configured
+     * basic-auth user and password.
+     */
+    public URLGetter(SpiderConfig config)
+    {
+        LechLogger.debug("URLGetter()");
+        this.config = config;
+
+        Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword()));
+    }
+
+    /**
+     * Downloads the given URL.
+     *
+     * @param url the URL to fetch, together with its referer
+     * @return the downloaded object, or null if the URL is not an HTTP URL,
+     *         the download failed, or fewer bytes arrived than the server's
+     *         Content-Length announced
+     */
+    public URLObject getURL(URLToDownload url)
+    {
+        LechLogger.debug("getURL(" + url + ")");
+
+        if(failureCount > 10)
+        {
+            LechLogger.warn("Lots of failures recently, waiting 5 seconds before attempting download");
+            try
+            {
+                Thread.sleep(5 * 1000);
+            }
+            catch(InterruptedException e)
+            {
+                // Keep the interrupt visible to whoever owns this thread
+                // instead of silently discarding it.
+                Thread.currentThread().interrupt();
+            }
+            failureCount = 0;
+        }
+
+        URL requestedURL = url.getURL();
+        URL referer = url.getReferer();
+
+        try
+        {
+            LechLogger.debug("Creating HTTP connection to " + requestedURL);
+            URLConnection rawConn = requestedURL.openConnection();
+            if(!(rawConn instanceof HttpURLConnection))
+            {
+                // e.g. ftp: or file: links; the unchecked cast used to throw
+                // a ClassCastException out of this method.
+                LechLogger.warn("Cannot download non-HTTP URL: " + requestedURL);
+                return null;
+            }
+            HttpURLConnection conn = (HttpURLConnection) rawConn;
+            if(referer != null)
+            {
+                LechLogger.debug("Setting Referer header to " + referer);
+                conn.setRequestProperty("Referer", referer.toExternalForm());
+            }
+
+            if(config.getUserAgent() != null)
+            {
+                LechLogger.debug("Setting User-Agent to " + config.getUserAgent());
+                conn.setRequestProperty("User-Agent", config.getUserAgent());
+            }
+
+            conn.setUseCaches(false);
+
+            LechLogger.debug("Opening URL");
+            long startTime = System.currentTimeMillis();
+            conn.connect();
+
+            String resp = conn.getResponseMessage();
+            LechLogger.debug("Remote server response: " + resp);
+
+            String respStr = conn.getHeaderField(0);
+            LechLogger.info("Server response: " + respStr);
+
+            for(int i = 1; ; i++)
+            {
+                String key = conn.getHeaderFieldKey(i);
+                if(key == null)
+                {
+                    break;
+                }
+                // Look the value up by index: a lookup by key returns only
+                // one value when a header (e.g. Set-Cookie) is repeated.
+                String value = conn.getHeaderField(i);
+                LechLogger.debug("Received header " + key + ": " + value);
+            }
+
+            LechLogger.debug("Getting buffered input stream from remote connection");
+            byte[] content = readAndClose(new BufferedInputStream(conn.getInputStream()));
+
+            long timeTaken = System.currentTimeMillis() - startTime;
+            // Transfers shorter than 100 ms are reported as if they took
+            // 500 ms. NOTE(review): presumably to avoid absurd rates for
+            // tiny downloads -- confirm the intent.
+            if(timeTaken < 100) timeTaken = 500;
+
+            int bytesPerSec = (int) ((double) content.length / ((double)timeTaken / 1000.0));
+            LechLogger.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec");
+            if(content.length < conn.getContentLength())
+            {
+                LechLogger.warn("Didn't download full content for URL: " + url);
+                failureCount++;
+                return null;
+            }
+            return new URLObject(requestedURL, conn.getContentType(), content, config);
+        }
+        catch(FileNotFoundException fnfe) {
+            // A missing page is not counted towards the back-off threshold.
+            LechLogger.warn("File not found: " + fnfe.getMessage());
+            return null;
+        }
+        catch(IOException ioe)
+        {
+            LechLogger.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
+            failureCount++;
+            return null;
+        }
+    }
+
+    /**
+     * Reads the stream to its end and returns the bytes. The stream is
+     * always closed, even when reading fails.
+     */
+    private static byte[] readAndClose(InputStream in) throws IOException
+    {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream(10240);
+        try
+        {
+            byte[] buf = new byte[1024];
+            int bytesRead;
+            while((bytesRead = in.read(buf)) >= 0)
+            {
+                baos.write(buf, 0, bytesRead);
+            }
+        }
+        finally
+        {
+            try
+            {
+                in.close();
+            }
+            catch(IOException ignored)
+            {
+                // The data (if any) has already been read; a failure to
+                // close must not turn a good download into a failed one.
+            }
+        }
+        return baos.toByteArray();
+    }
+}
diff --git a/weblech/spider/URLObject.java b/weblech/spider/URLObject.java
new file mode 100755
index 0000000..00e3670
--- /dev/null
+++ b/weblech/spider/URLObject.java
@@ -0,0 +1,206 @@
+/*
+ * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
+ *
+ * Copyright (c) 2001 Brian Pitcher
+ * Copyright (c) 2004 Andrew Coleman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall
be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package weblech.spider;
+
+import java.io.*;
+import java.net.URL;
+import java.net.URLEncoder;
+
+import weblech.ui.LechLogger;
+
+/**
+ * The content of one URL, either freshly downloaded or re-read from the
+ * copy previously saved under the configured save root directory.
+ */
+public class URLObject
+{
+    private final URL sourceURL;
+    private final String contentType;
+    private final byte[] content;
+
+    private final SpiderConfig config;
+
+    /**
+     * Wraps already-downloaded content.
+     *
+     * @param contentType the content type reported for the data; may be null
+     */
+    public URLObject(URL sourceURL, String contentType, byte[] content, SpiderConfig config)
+    {
+        this.sourceURL = sourceURL;
+        this.contentType = contentType;
+        this.content = content;
+        this.config = config;
+    }
+
+    /**
+     * Loads the on-disk copy of the URL, if there is one. The content type
+     * is guessed from the URL text (".jpg", ".gif", otherwise HTML). When no
+     * saved file exists the content is empty.
+     */
+    public URLObject(URL sourceURL, SpiderConfig config)
+    {
+        this.sourceURL = sourceURL;
+        this.config = config;
+
+        String s = sourceURL.toExternalForm().toLowerCase();
+        if(s.indexOf(".jpg") != -1)
+        {
+            contentType = "image/jpeg";
+        }
+        else if(s.indexOf(".gif") != -1)
+        {
+            contentType = "image/gif";
+        }
+        else
+        {
+            contentType = "text/html";
+        }
+
+        byte[] data = new byte[0];
+        if(existsOnDisk())
+        {
+            File f = new File(convertToFileName());
+            // existsOnDisk() already rejects directories, so this only
+            // matters if the path changes between the two checks.
+            if(f.isDirectory())
+            {
+                f = new File(f, "index.html");
+            }
+            data = new byte[(int) f.length()];
+            DataInputStream in = null;
+            try
+            {
+                in = new DataInputStream(new FileInputStream(f));
+                // readFully keeps reading until the buffer is full; a single
+                // read() call is allowed to return fewer bytes.
+                in.readFully(data);
+            }
+            catch(IOException ioe)
+            {
+                LechLogger.warn("IO Exception reading disk version of URL " + sourceURL, ioe);
+            }
+            finally
+            {
+                if(in != null)
+                {
+                    try
+                    {
+                        in.close();
+                    }
+                    catch(IOException ignored)
+                    {
+                        // Nothing useful can be done about a failed close
+                        // of a stream that was only read from.
+                    }
+                }
+            }
+        }
+        content = data;
+    }
+
+    /** Returns the content type as given or guessed; may be null. */
+    public String getContentType()
+    {
+        return contentType;
+    }
+
+    /** Returns true if the content type starts with "text/html" (any case). */
+    public boolean isHTML()
+    {
+        return contentTypeStartsWith("text/html");
+    }
+
+    /** Returns true if the content type starts with "text/xml" (any case). */
+    public boolean isXML()
+    {
+        return contentTypeStartsWith("text/xml");
+    }
+
+    /** Returns true if the content type starts with "image/" (any case). */
+    public boolean isImage()
+    {
+        return contentTypeStartsWith("image/");
+    }
+
+    /**
+     * Case-insensitive prefix test on the content type. An unknown (null)
+     * content type matches nothing.
+     */
+    private boolean contentTypeStartsWith(String prefix)
+    {
+        return contentType != null && contentType.toLowerCase().startsWith(prefix);
+    }
+
+    /** Returns the content decoded with the platform default charset. */
+    public String getStringContent()
+    {
+        return new String(content);
+    }
+
+    /**
+     * Maps the source URL to a path below the save root directory: a leading
+     * "http://" is dropped, "index.html" is appended to directory-like URLs,
+     * and "?" and "&" are percent-encoded.
+     */
+    private String convertToFileName()
+    {
+        String url = sourceURL.toExternalForm();
+        int httpIdx = url.indexOf("http://");
+        if(httpIdx == 0)
+        {
+            url = url.substring(7);
+        }
+        // Check for at least one slash -- otherwise host name (e.g. sourceforge.net)
+        if(url.indexOf("/") < 0)
+        {
+            url = url + "/";
+        }
+        // If trailing slash, add index.html as default
+        if(url.endsWith("/"))
+        {
+            url = url + "index.html";
+        }
+        try {
+            /* the old encode method is now deprecated, updated to the new API -- Coleman */
+            url = textReplace("?", URLEncoder.encode("?","UTF-8"), url);
+            url = textReplace("&", URLEncoder.encode("&","UTF-8"), url);
+        }
+        catch ( java.io.UnsupportedEncodingException exception ) {
+            LechLogger.error ( exception.toString() );
+        }
+        return config.getSaveRootDirectory().getPath() + "/" + url;
+    }
+
+    /** Returns true if a regular (non-directory) file exists for this URL. */
+    public boolean existsOnDisk()
+    {
+        File f = new File(convertToFileName());
+        return (f.exists() && !f.isDirectory());
+    }
+
+    /**
+     * Saves the content under the save root directory. URLs whose path would
+     * resolve outside that directory (for example through ".." segments in a
+     * link on a crawled page) are refused and logged.
+     */
+    public void writeToFile()
+    {
+        String fileName = convertToFileName();
+        if(!isInsideSaveRoot(new File(fileName)))
+        {
+            LechLogger.warn("Refusing to write outside the save directory: " + fileName);
+            return;
+        }
+        writeToFile(fileName);
+    }
+
+    /**
+     * Writes the content to the named file, creating parent directories as
+     * needed. I/O failures are logged, not thrown.
+     */
+    public void writeToFile(String fileName)
+    {
+        LechLogger.debug("writeToFile(" + fileName + ")");
+        FileOutputStream out = null;
+        try
+        {
+            File f = new File(fileName);
+            File parent = f.getParentFile();
+            // A bare file name has no parent directory to create.
+            if(parent != null)
+            {
+                parent.mkdirs();
+            }
+            out = new FileOutputStream(f);
+            out.write(content);
+            out.flush();
+        }
+        catch(IOException ioe)
+        {
+            LechLogger.warn("IO Exception writing to " + fileName, ioe);
+        }
+        finally
+        {
+            if(out != null)
+            {
+                try
+                {
+                    out.close();
+                }
+                catch(IOException ioe)
+                {
+                    LechLogger.warn("IO Exception writing to " + fileName, ioe);
+                }
+            }
+        }
+    }
+
+    /** Returns "URLObject: " followed by the content type. */
+    public String toString()
+    {
+        return "URLObject: " + contentType;
+    }
+
+    /**
+     * Returns true if the canonical form of the target lies below the
+     * canonical save root directory.
+     */
+    private boolean isInsideSaveRoot(File target)
+    {
+        try
+        {
+            String root = config.getSaveRootDirectory().getCanonicalPath();
+            if(!root.endsWith(File.separator))
+            {
+                root = root + File.separator;
+            }
+            return target.getCanonicalPath().startsWith(root);
+        }
+        catch(IOException ioe)
+        {
+            LechLogger.warn("IO Exception resolving " + target, ioe);
+            return false;
+        }
+    }
+
+    /**
+     * Replaces every occurrence of find in input with replace. The search
+     * resumes after each inserted replacement, so the replacement text is
+     * never re-scanned.
+     */
+    private String textReplace(String find, String replace, String input)
+    {
+        int startPos = 0;
+        while(true)
+        {
+            int textPos = input.indexOf(find, startPos);
+            if(textPos < 0)
+            {
+                break;
+            }
+            input = input.substring(0, textPos) + replace + input.substring(textPos + find.length());
+            startPos = textPos + replace.length();
+        }
+        return input;
+    }
+}
diff --git a/weblech/spider/URLToDownload.java b/weblech/spider/URLToDownload.java
new file mode 100755
index 0000000..61fa44d
--- /dev/null
+++ b/weblech/spider/URLToDownload.java
@@ -0,0 +1,68 @@
+/*
+ * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
+ *
+ * Copyright (c) 2001 Brian Pitcher
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// $Header: /home/andrew/Projects/penguincoder/cvs/WebLech/weblech/spider/URLToDownload.java,v 1.1 2004/03/07 20:51:05 mercury Exp $
+
+package weblech.spider;
+
+import java.net.URL;
+
+/**
+ * An immutable, serializable description of one pending download: the URL
+ * itself, the URL that referred to it (may be null) and its depth.
+ */
+public class URLToDownload implements java.io.Serializable
+{
+    private final URL url;
+    private final URL referer;
+    private final int depth;
+
+    /** Creates an entry without a referer. */
+    public URLToDownload(URL url, int depth)
+    {
+        this(url, null, depth);
+    }
+
+    /** Creates an entry with the given referer, which may be null. */
+    public URLToDownload(URL url, URL referer, int depth)
+    {
+        this.url = url;
+        this.referer = referer;
+        this.depth = depth;
+    }
+
+    /** Returns the URL to download. */
+    public URL getURL()
+    {
+        return url;
+    }
+
+    /** Returns the referring URL, or null if none was given. */
+    public URL getReferer()
+    {
+        return referer;
+    }
+
+    /** Returns the depth recorded for this URL. */
+    public int getDepth()
+    {
+        return depth;
+    }
+
+    /** Returns "url, referer r, depth d". */
+    public String toString()
+    {
+        return url + ", referer " + referer + ", depth " + depth;
+    }
+}
diff --git a/weblech/ui/LechLogger.java b/weblech/ui/LechLogger.java
new file mode 100755
index 0000000..240a269
--- /dev/null
+++ b/weblech/ui/LechLogger.java
@@ -0,0 +1,138 @@
+/**
+ * LechLogger.java: A Graphical Logger
+ * The original weblech had a simple text interface and used an apache.org library for logging.
+ * When I wanted to make this thing into a graphical app, I realized the text logging
+ * was going to have to go. It assumes you only want to log to one location (not a problem for
+ * this application).
+ */
+package weblech.ui;
+
+import javax.swing.JTextArea;
+import javax.swing.SwingUtilities;
+import java.io.IOException;
+
+public class LechLogger {
+
+    /**
+     * The actual text area that will perform all output.
+     */
+    private static JTextArea _loggerWindow;
+    /**
+     * These are flags for enabling different types of logging mechanisms.
+     */
+    private static boolean error_enable, warn_enable, info_enable, debug_enable;
+
+    /**
+     * Only need one initialization since this will be shared between many different
+     * objects. Every level starts out enabled and there is no output area yet.
+     */
+    static {
+        _loggerWindow = null;
+        error_enable = true;
+        warn_enable = true;
+        info_enable = true;
+        debug_enable = true;
+    }
+
+    /** Everybody wants to log, but you only need one logger! */
+    public LechLogger() {
+    }
+
+    /**
+     * Sets the textual component to perform the logging. Until one is set,
+     * all messages are discarded.
+     */
+    public static void setTextArea ( JTextArea textarea ) {
+        _loggerWindow = textarea;
+    }
+
+    /**
+     * A private method for actually writing the messages.
+     * It is synchronized because the weblech spider is multi
+     * threaded. Swing components must only be touched on the event
+     * dispatch thread, so calls from spider threads are handed over to it
+     * with invokeLater (which keeps messages in submission order).
+     */
+    private static synchronized void log ( final String msg ) {
+        final JTextArea target = _loggerWindow;
+        if ( target == null ) {
+            return;
+        }
+        if ( SwingUtilities.isEventDispatchThread() ) {
+            appendLine ( target, msg );
+        }
+        else {
+            SwingUtilities.invokeLater ( new Runnable() {
+                public void run() {
+                    appendLine ( target, msg );
+                }
+            } );
+        }
+    }
+
+    /**
+     * Appends one line to the text area and leaves it non-editable.
+     */
+    private static void appendLine ( JTextArea target, String msg ) {
+        target.setEditable ( true );
+        target.append ( msg );
+        target.append ( "\n" );
+        target.setEditable ( false );
+    }
+
+    /**
+     * Describes an exception including its class name; the bare message
+     * alone is often just a host or file name, or null.
+     */
+    private static String describe ( Exception exception ) {
+        return ( exception == null ) ? "(no exception)" : exception.toString();
+    }
+
+    /**
+     * Toggle error logging.
+     */
+    public static void setErrorLogging() {
+        error_enable = !error_enable;
+    }
+
+    /**
+     * Log an error message.
+     */
+    public static void error ( String msg ) {
+        if ( !error_enable ) return;
+        log ( "*error>" + msg );
+    }
+
+    /**
+     * Log an error message and an exception.
+     */
+    public static void error ( String msg, Exception exception ) {
+        if ( !error_enable ) return;
+        log ( "*error>" + msg + "\n" + describe ( exception ) );
+    }
+
+    /**
+     * Toggle informational messages.
+     */
+    public static void setInformationalLogging() {
+        info_enable = !info_enable;
+    }
+
+    /**
+     * Log an informational message.
+     */
+    public static void info ( String msg ) {
+        if ( !info_enable ) return;
+        log ( "^info>" + msg );
+    }
+
+    /**
+     * Toggle warning messages.
+     */
+    public static void setWarningLogging() {
+        warn_enable = !warn_enable;
+    }
+
+    /**
+     * Log a warning message.
+     */
+    public static void warn ( String msg ) {
+        if ( !warn_enable ) return;
+        log ( "-warn>" + msg );
+    }
+
+    /**
+     * Log a warning message with an exception.
+     */
+    public static void warn ( String msg, IOException exception ) {
+        if ( !warn_enable ) return;
+        log ( "-warn>" + msg + "\n" + describe ( exception ) );
+    }
+
+    /**
+     * Toggle debug messages to be printed.
+     */
+    public static void setDebugLogging() {
+        debug_enable = !debug_enable;
+    }
+
+    /**
+     * Log a debugging statement to the logging text area.
+     */
+    public static void debug ( String msg ) {
+        if ( !debug_enable ) return;
+        log ( "@debug>" + msg );
+    }
+}
\ No newline at end of file
diff --git a/weblech/ui/SpiderConfigPanel.java b/weblech/ui/SpiderConfigPanel.java
new file mode 100755
index 0000000..4cb6164
--- /dev/null
+++ b/weblech/ui/SpiderConfigPanel.java
@@ -0,0 +1,251 @@
+/**
+ * SpiderConfigPanel.java: A graphical panel for configuring a SpiderConfig object.
+ * This panel provides a way to change the more practical options of the WebLech Spider.
+ * It supports saving and opening of SpiderConfigurations from a file. It does not use
+ * any of the "interesting" or "boring" url features, or the email link save file.
+ */
+package weblech.ui;
+
+import weblech.spider.Spider;
+import weblech.spider.SpiderConfig;
+
+import javax.swing.JPanel;
+import javax.swing.JTextField;
+import javax.swing.JLabel;
+import javax.swing.JButton;
+import javax.swing.JComboBox;
+import javax.swing.JFileChooser;
+
+import java.awt.GridLayout;
+import java.awt.FlowLayout;
+
+import java.awt.event.ActionListener;
+import java.awt.event.ActionEvent;
+
+import java.util.Properties;
+import java.util.ArrayList;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+
+public class SpiderConfigPanel extends JPanel implements ActionListener {
+
+    /**
+     * A list of all of the spiders that the GUI will have downloading. It is assumed
+     * that the user knows how much bandwidth you have and really wants to try and get
+     * several different sites at the same time.
+     */
+    private ArrayList spiders;
+    /**
+     * Various text fields for the configuration options.
+     */
+    private JTextField sitenamefield, dirfield, usernamefield, passwordfield, agentfield, depthfield, matchfield;
+    /**
+     * A selection box for the number of threads a new Spider should use, I am limiting
+     * the number of threads to 4 for simplicity.
+     */
+    private JComboBox threadbox;
+
+    /**
+     * Builds the panel: six labelled text rows, one row holding the maximum
+     * depth and thread count, and a row of buttons.
+     */
+    public SpiderConfigPanel() {
+        super ( new GridLayout ( 8, 1 ) );
+        spiders = new ArrayList();
+
+        /* The text fields with their initial values */
+        dirfield = new JTextField ( System.getProperty ( "user.home" ), 20 );
+        sitenamefield = new JTextField ( "http://www.google.com/", 20 );
+        usernamefield = new JTextField ( "", 20 );
+        passwordfield = new JTextField ( "", 20 );
+        agentfield = new JTextField ( "WebLech [Version C]", 20 );
+        matchfield = new JTextField ( "", 20 );
+        depthfield = new JTextField ( Integer.toString ( 0 ), 5 );
+
+        /* Provides a panel for placing both the maximum depth and threads for this spider */
+        JPanel threadpanel = new JPanel();
+        Integer[] threaditems = { new Integer ( "1" ), new Integer ( "2" ), new Integer ( "3" ), new Integer ( "4" ) };
+        threadbox = new JComboBox ( threaditems );
+        threadpanel.add ( new JLabel ( "Spider Threads:" ) );
+        threadpanel.add ( threadbox );
+
+        JPanel detailpanel = new JPanel ( new GridLayout ( 1, 2 ) );
+        detailpanel.add ( labeledRow ( "Max Depth:", depthfield ) );
+        detailpanel.add ( threadpanel );
+
+        /* Panel of buttons for various operations */
+        JPanel buttonpanel = new JPanel();
+        addButton ( buttonpanel, "Save", "save" );
+        addButton ( buttonpanel, "Spider It", "spider" );
+        addButton ( buttonpanel, "Open", "open" );
+        addButton ( buttonpanel, "Quit", "quit" );
+
+        add ( labeledRow ( "Output Directory:", dirfield ) );
+        add ( labeledRow ( "Download Site:", sitenamefield ) );
+        add ( labeledRow ( "Username:", usernamefield ) );
+        add ( labeledRow ( "Password:", passwordfield ) );
+        add ( labeledRow ( "User Agent:", agentfield ) );
+        /* A simple string match downloading limiter (no match, no download) */
+        add ( labeledRow ( "Match String:", matchfield ) );
+        add ( detailpanel );
+        add ( buttonpanel );
+        LechLogger.debug ( "Actions" );
+    }
+
+    /**
+     * Creates a right-aligned row consisting of a label followed by a field.
+     */
+    private JPanel labeledRow ( String label, JTextField field ) {
+        JPanel row = new JPanel ( new FlowLayout ( FlowLayout.RIGHT ) );
+        row.add ( new JLabel ( label ) );
+        row.add ( field );
+        return row;
+    }
+
+    /**
+     * Creates a button that reports the given action command to this panel
+     * and adds it to the given parent.
+     */
+    private void addButton ( JPanel parent, String label, String command ) {
+        JButton button = new JButton ( label );
+        button.setActionCommand ( command );
+        button.addActionListener ( this );
+        parent.add ( button );
+    }
+
+    /**
+     * This method will create a Properties object good for instantiating a new SpiderConfig
+     * Object.
+     */
+    private Properties createProperties() {
+        Properties p = new Properties();
+        p.setProperty ( "saveRootDirectory", dirfield.getText() );
+        p.setProperty ( "startLocation", sitenamefield.getText() );
+        p.setProperty ( "basicAuthUser", usernamefield.getText() );
+        p.setProperty ( "basicAuthPassword", passwordfield.getText() );
+        p.setProperty ( "urlMatch", matchfield.getText() );
+        p.setProperty ( "spiderThreads", ((Integer) threadbox.getSelectedItem()).toString() );
+        p.setProperty ( "maxDepth", depthfield.getText() );
+        p.setProperty ( "userAgent", agentfield.getText() );
+        p.setProperty ( "interestingURLs", "" );
+        return p;
+    }
+
+    /**
+     * This method will extract all of the values from a SpiderConfig object that the GUI uses
+     * and updates the panel to show the values in the object. A thread count outside 1..4 is
+     * reset to 1, both in the panel and in the given object.
+     */
+    private void setSpiderConfig ( SpiderConfig sc ) {
+        dirfield.setText ( sc.getSaveRootDirectory().toString() );
+        sitenamefield.setText ( sc.getStartLocation().toString() );
+        usernamefield.setText ( sc.getBasicAuthUser() );
+        passwordfield.setText ( sc.getBasicAuthPassword() );
+        matchfield.setText ( sc.getURLMatch() );
+        int t = sc.getSpiderThreads();
+        if ( t < 1 || t > 4 ) {
+            t = 1;
+            sc.setSpiderThreads ( t );
+        }
+        threadbox.setSelectedIndex ( t - 1 );
+        depthfield.setText ( Integer.toString ( sc.getMaxDepth() ) );
+        agentfield.setText ( sc.getUserAgent() );
+    }
+
+    /**
+     * This method will coordinate all of the actions for the various buttons used.
+     */
+    public void actionPerformed ( ActionEvent event ) {
+        String cmd = event.getActionCommand();
+        if ( cmd.equals ( "spider" ) ) {
+            startSpider();
+        }
+        else if ( cmd.equals ( "save" ) ) {
+            saveConfiguration();
+        }
+        else if ( cmd.equals ( "open" ) ) {
+            openConfiguration();
+        }
+        /* Just quit */
+        else if ( cmd.equals ( "quit" ) ) {
+            System.exit ( 0 );
+        }
+    }
+
+    /**
+     * Download a new site, but only if we are not already downloading it.
+     */
+    private void startSpider() {
+        SpiderConfig c = new SpiderConfig ( createProperties() );
+        Spider spider = new Spider ( c );
+        /* NOTE(review): this check can only ever match if Spider defines equals() -- confirm */
+        if ( spiders.contains ( spider ) ) {
+            LechLogger.warn ( "Already have an instance of a Spider at " + c.getStartLocation() );
+            return;
+        }
+        spiders.add ( spider );
+        spider.start();
+    }
+
+    /**
+     * Save the current configuration to a file chosen by the user.
+     */
+    private void saveConfiguration() {
+        JFileChooser f = new JFileChooser ( System.getProperty ( "user.home" ) );
+        int r = f.showSaveDialog ( this );
+        if ( r != JFileChooser.APPROVE_OPTION ) return;
+        File outfile = f.getSelectedFile();
+        FileOutputStream raw = null;
+        try {
+            raw = new FileOutputStream ( outfile );
+            ObjectOutputStream os = new ObjectOutputStream ( raw );
+            os.writeObject ( new SpiderConfig ( createProperties() ) );
+            os.close();
+        }
+        catch ( Exception exception ) {
+            LechLogger.error ( exception.toString() );
+        }
+        finally {
+            /* Release the file even when writing failed part way through */
+            if ( raw != null ) {
+                try {
+                    raw.close();
+                }
+                catch ( java.io.IOException ignored ) {
+                    /* any real write failure has already been reported above */
+                }
+            }
+        }
+    }
+
+    /**
+     * Open a saved configuration from a file chosen by the user.
+     * NOTE(review): this uses Java serialization, so only files written by the
+     * Save button should be opened; a foreign file can instantiate arbitrary
+     * serializable classes.
+     */
+    private void openConfiguration() {
+        JFileChooser f = new JFileChooser ( System.getProperty ( "user.home" ) );
+        int r = f.showOpenDialog ( this );
+        if ( r != JFileChooser.APPROVE_OPTION ) return;
+        File infile = f.getSelectedFile();
+        if ( !infile.canRead() ) {
+            /* Name the selected file; this used to print the chooser component */
+            LechLogger.error ( "file " + infile.toString() + " is not readable" );
+            return;
+        }
+        FileInputStream raw = null;
+        try {
+            raw = new FileInputStream ( infile );
+            ObjectInputStream os = new ObjectInputStream ( raw );
+            SpiderConfig sc = (SpiderConfig) os.readObject();
+            os.close();
+            setSpiderConfig ( sc );
+        }
+        catch ( Exception exception ) {
+            LechLogger.error ( exception.toString() );
+        }
+        finally {
+            /* Release the file even when reading failed part way through */
+            if ( raw != null ) {
+                try {
+                    raw.close();
+                }
+                catch ( java.io.IOException ignored ) {
+                    /* nothing useful can be done about a failed close after reading */
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/weblech/ui/Troll.java b/weblech/ui/Troll.java
new file mode 100755
index 0000000..47c2432
--- /dev/null
+++ b/weblech/ui/Troll.java
@@ -0,0
+1,113 @@
+/**
+ * Troll.java: A user interface to the weblech spider download utility.
+ */
+package weblech.ui;
+
+/* I like to explicitly import all of my packages to remind me to KISS */
+import javax.swing.JFrame;
+import javax.swing.JPanel;
+import javax.swing.JTabbedPane;
+import javax.swing.JTextArea;
+import javax.swing.JScrollPane;
+import javax.swing.JMenuBar;
+import javax.swing.JMenu;
+import javax.swing.JMenuItem;
+import javax.swing.JCheckBoxMenuItem;
+
+import java.awt.Dimension;
+
+import java.awt.event.ActionListener;
+import java.awt.event.ActionEvent;
+
+public class Troll extends JFrame implements ActionListener {
+
+    /**
+     * This SpiderConfigPanel is a custom panel that provides many of the more
+     * practical features of the weblech spider. It also controls the spiders
+     * created by the user.
+     */
+    private SpiderConfigPanel configpanel;
+    /**
+     * This is the area that all of the logging facilities will use. This makes debugging
+     * in a system like Mac OS X much simpler (:^)
+     */
+    private static JTextArea logarea;
+
+    /* This just initializes the logging text box and readies it for recording events before
+     * the rest of the object is even loaded. LechLogger starts with debug logging enabled,
+     * so the toggle below switches it off to match the unchecked "Show Debug Messages" item.
+     */
+    static {
+        logarea = new JTextArea();
+        LechLogger.setTextArea ( logarea );
+        LechLogger.setDebugLogging();
+    }
+
+    /**
+     * Builds and shows the main window: a "Log Options" menu plus a tabbed
+     * pane holding the spider configuration panel and the log area.
+     */
+    Troll() {
+        super ( "Troll" );
+        Dimension initialsize = new Dimension ( 400, 375 );
+        setSize ( initialsize);
+
+        /* Create a menubar for controlling which aspects of the log you wish to see */
+        JMenuBar menubar = new JMenuBar();
+        JMenu logmenu = new JMenu ( "Log Options" );
+        addLogToggle ( logmenu, "Show Debug Messages", "debug", false );
+        addLogToggle ( logmenu, "Show Informational Messages", "info", true );
+        addLogToggle ( logmenu, "Show Warnings", "warn", true );
+        addLogToggle ( logmenu, "Show Errors", "error", true );
+        menubar.add ( logmenu );
+
+        /* A simple tab interface between configuration and error checking */
+        configpanel = new SpiderConfigPanel();
+        JPanel logpanel = new JPanel();
+        logpanel.add ( logarea );
+        JScrollPane logscroller = new JScrollPane ( logpanel );
+        JTabbedPane tabs = new JTabbedPane();
+        tabs.addTab ( "Spider", configpanel );
+        tabs.addTab ( "Log", logscroller );
+
+        /* Configure the JFrame to a usable state */
+        setJMenuBar ( menubar );
+        getContentPane().add ( tabs );
+        setLocationRelativeTo ( null );
+        setDefaultCloseOperation ( JFrame.EXIT_ON_CLOSE );
+        setVisible ( true );
+    }
+
+    /**
+     * Adds a check box item to the menu that reports the given action
+     * command to this frame whenever it is clicked.
+     */
+    private void addLogToggle ( JMenu menu, String label, String command, boolean checked ) {
+        JCheckBoxMenuItem item = new JCheckBoxMenuItem ( label, checked );
+        item.setActionCommand ( command );
+        item.addActionListener ( this );
+        menu.add ( item );
+    }
+
+    /**
+     * This method basically toggles all of the logging options. Each menu
+     * click flips the matching LechLogger level.
+     */
+    public void actionPerformed ( ActionEvent event ) {
+        String cmd = event.getActionCommand();
+        if ( cmd.equals ( "debug" ) ) {
+            LechLogger.setDebugLogging();
+        }
+        else if ( cmd.equals ( "info" ) ) {
+            LechLogger.setInformationalLogging();
+        }
+        else if ( cmd.equals ( "warn" ) ) {
+            /* This used to toggle debug logging, so warnings could never be
+             * hidden and the debug level got out of step with its check box. */
+            LechLogger.setWarningLogging();
+        }
+        else if ( cmd.equals ( "error" ) ) {
+            LechLogger.setErrorLogging();
+        }
+    }
+
+    /**
+     * Create a new troll and go
+     */
+    public static void main ( String[] args ) {
+        new Troll();
+    }
+}