Initial revision
commit
80c48012cd
|
@ -0,0 +1,26 @@
|
||||||
|
WebLech license information.
|
||||||
|
============================
|
||||||
|
|
||||||
|
This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
|
||||||
|
Copyright (c) 2001 Brian Pitcher
|
||||||
|
Copyright (c) 2004 Andrew Coleman
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
copy of this software and associated documentation files (the "Software"),
|
||||||
|
to deal in the Software without restriction, including without limitation
|
||||||
|
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
Software is furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
<project name="WebLech" default="dist" basedir=".">
    <description>
        WebLech - a tool for downloading the web
    </description>

    <!-- Source tree, compiled-class directory and jar output directory. -->
    <property name="src" location="weblech" />
    <property name="build" location="build" />
    <property name="dist" location="jars" />

    <target name="init" description="Set the build timestamp and create the class output directory">
        <!-- tstamp defines DSTAMP, which the dist target uses in the jar file name. -->
        <tstamp />
        <mkdir dir="${build}" />
    </target>

    <target name="compile" depends="init" description="Compile the Java sources">
        <!-- includeantruntime is set explicitly so the compile classpath does not depend on
             Ant's own libraries. -->
        <!-- NOTE(review): srcdir points at the weblech package directory itself rather than
             its parent; confirm this matches the intended source root. -->
        <javac srcdir="${src}" destdir="${build}" includeantruntime="false" />
    </target>

    <target name="dist" depends="compile" description="Package the compiled classes into a date-stamped runnable jar">
        <mkdir dir="${dist}" />
        <!-- The manifest names the class that is started by java -jar. -->
        <manifest file="${dist}/MANIFEST.MF">
            <attribute name="Main-Class" value="weblech.ui.Troll" />
        </manifest>
        <!-- destfile replaces the deprecated jarfile attribute. -->
        <jar manifest="${dist}/MANIFEST.MF" destfile="${dist}/WebLech-${DSTAMP}.jar" basedir="${build}" />
    </target>

    <target name="clean" description="Delete the class and jar output directories">
        <delete dir="${build}" />
        <delete dir="${dist}" />
    </target>
</project>
|
|
@ -0,0 +1,18 @@
|
||||||
|
/*
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: Michael Mason
|
||||||
|
* Date: Jun 5, 2002
|
||||||
|
* Time: 6:43:04 PM
|
||||||
|
* To change template for new interface use
|
||||||
|
* Code Style | Class Templates options (Tools | IDE Options).
|
||||||
|
*/
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
/**
 * Timing constants for the spider. Classes pick these up by implementing
 * the interface.
 */
public interface Constants
{
    /**
     * How often to check the queue status. Spider.run() passes this value to
     * Thread.sleep(), so it is a number of milliseconds.
     */
    public static final int QUEUE_CHECK_INTERVAL = 500;

    /** How long to pause for threads to finish before exiting. */
    public static final int SPIDER_STOP_PAUSE = 500;
}
|
|
@ -0,0 +1,143 @@
|
||||||
|
/*
|
||||||
|
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 Brian Pitcher
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// $Header: /home/andrew/Projects/penguincoder/cvs/WebLech/weblech/spider/DownloadQueue.java,v 1.1 2004/03/07 20:51:05 mercury Exp $
|
||||||
|
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
public class DownloadQueue implements Serializable
|
||||||
|
{
|
||||||
|
private SpiderConfig config;
|
||||||
|
|
||||||
|
private List interestingURLsToDownload;
|
||||||
|
private List averageURLsToDownload;
|
||||||
|
private List boringURLsToDownload;
|
||||||
|
private Set urlsInQueue;
|
||||||
|
|
||||||
|
public DownloadQueue(SpiderConfig config)
|
||||||
|
{
|
||||||
|
this.config = config;
|
||||||
|
interestingURLsToDownload = new ArrayList();
|
||||||
|
averageURLsToDownload = new ArrayList();
|
||||||
|
boringURLsToDownload = new ArrayList();
|
||||||
|
urlsInQueue = new HashSet();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void queueURL(URLToDownload url)
|
||||||
|
{
|
||||||
|
URL u = url.getURL();
|
||||||
|
if(urlsInQueue.contains(u))
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(config.isInteresting(u))
|
||||||
|
{
|
||||||
|
if(config.isDepthFirstSearch())
|
||||||
|
{
|
||||||
|
interestingURLsToDownload.add(0, url);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
interestingURLsToDownload.add(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(config.isBoring(u))
|
||||||
|
{
|
||||||
|
if(config.isDepthFirstSearch())
|
||||||
|
{
|
||||||
|
boringURLsToDownload.add(0, url);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
boringURLsToDownload.add(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if(config.isDepthFirstSearch())
|
||||||
|
{
|
||||||
|
averageURLsToDownload.add(0, url);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
averageURLsToDownload.add(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
urlsInQueue.add(u);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void queueURLs(Collection urls)
|
||||||
|
{
|
||||||
|
for(Iterator i = urls.iterator(); i.hasNext(); )
|
||||||
|
{
|
||||||
|
URLToDownload u2d = (URLToDownload) i.next();
|
||||||
|
queueURL(u2d);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public URLToDownload getNextInQueue()
|
||||||
|
{
|
||||||
|
if(interestingURLsToDownload.size() > 0)
|
||||||
|
{
|
||||||
|
return returnURLFrom(interestingURLsToDownload);
|
||||||
|
}
|
||||||
|
else if(averageURLsToDownload.size() > 0)
|
||||||
|
{
|
||||||
|
return returnURLFrom(averageURLsToDownload);
|
||||||
|
}
|
||||||
|
else if(boringURLsToDownload.size() > 0)
|
||||||
|
{
|
||||||
|
return returnURLFrom(boringURLsToDownload);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private URLToDownload returnURLFrom(List urlList)
|
||||||
|
{
|
||||||
|
URLToDownload u2d = (URLToDownload) urlList.get(0);
|
||||||
|
urlList.remove(0);
|
||||||
|
urlsInQueue.remove(u2d.getURL());
|
||||||
|
return u2d;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size()
|
||||||
|
{
|
||||||
|
return interestingURLsToDownload.size() + averageURLsToDownload.size() + boringURLsToDownload.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString()
|
||||||
|
{
|
||||||
|
return size() + " URLs";
|
||||||
|
}
|
||||||
|
|
||||||
|
} // End class DownloadQueue
|
|
@ -0,0 +1,50 @@
|
||||||
|
/*
|
||||||
|
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 Brian Pitcher
|
||||||
|
* Copyright (c) 2004 Andrew Coleman
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
import java.net.Authenticator;
|
||||||
|
import java.net.PasswordAuthentication;
|
||||||
|
|
||||||
|
import weblech.ui.LechLogger;
|
||||||
|
|
||||||
|
public class DumbAuthenticator extends Authenticator
|
||||||
|
{
|
||||||
|
private final String user;
|
||||||
|
private final String password;
|
||||||
|
|
||||||
|
public DumbAuthenticator(String user, String password)
|
||||||
|
{
|
||||||
|
LechLogger.debug("DumbAuthenticator(" + user + ", ***)");
|
||||||
|
this.user = user;
|
||||||
|
this.password = password;
|
||||||
|
}
|
||||||
|
|
||||||
|
public PasswordAuthentication getPasswordAuthentication()
|
||||||
|
{
|
||||||
|
LechLogger.debug("getPasswordAuthentication()");
|
||||||
|
return new PasswordAuthentication(user, password.toCharArray());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,188 @@
|
||||||
|
/*
|
||||||
|
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 Brian Pitcher
|
||||||
|
* Copyright (c) 2004 Andrew Coleman
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.FileWriter;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
|
||||||
|
import weblech.ui.LechLogger;
|
||||||
|
|
||||||
|
public class HTMLParser
|
||||||
|
{
|
||||||
|
/** Configuration; supplies the file that mailto: links are logged to. */
private SpiderConfig config;

/**
 * Create a parser.
 *
 * @param config the spider configuration to read settings from
 */
public HTMLParser(SpiderConfig config)
{
    this.config = config;
}
|
||||||
|
|
||||||
|
public List parseLinksInDocument(URL sourceURL, String textContent)
|
||||||
|
{
|
||||||
|
return parseAsHTML(sourceURL, textContent);
|
||||||
|
}
|
||||||
|
|
||||||
|
private List parseAsHTML(URL sourceURL, String textContent)
|
||||||
|
{
|
||||||
|
LechLogger.debug("parseAsHTML()");
|
||||||
|
ArrayList newURLs = new ArrayList();
|
||||||
|
HashSet newURLSet = new HashSet();
|
||||||
|
|
||||||
|
/* note from coleman:
|
||||||
|
* I had to add a few tags into this, namely the link and embeds. weblech should download flash
|
||||||
|
* movies, mpegs, avis, and anything else that it finds on the page. even stylesheets :)
|
||||||
|
*/
|
||||||
|
extractAttributesFromTags("img", "src", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("a", "href", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("body", "background", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("frame", "src", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("link", "href", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("embed", "src", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("A", "HREF", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("BODY", "BACKGROUND", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("LINK", "HREF", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
extractAttributesFromTags("EMBED", "SRC", sourceURL, newURLs, newURLSet, textContent);
|
||||||
|
|
||||||
|
if(newURLs.size() == 0)
|
||||||
|
{
|
||||||
|
LechLogger.debug("Got 0 new URLs from HTML parse, check HTML\n" + textContent);
|
||||||
|
}
|
||||||
|
LechLogger.debug("Returning " + newURLs.size() + " urls extracted from page");
|
||||||
|
return newURLs;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void extractAttributesFromTags(String tag, String attr, URL sourceURL, List newURLs, Set newURLSet, String input)
|
||||||
|
{
|
||||||
|
LechLogger.debug("extractAttributesFromTags(" + tag + ", " + attr + ", ...)");
|
||||||
|
|
||||||
|
int startPos = 0;
|
||||||
|
String startTag = "<" + tag + " ";
|
||||||
|
String attrStr = attr + "=\"";
|
||||||
|
while(true)
|
||||||
|
{
|
||||||
|
int tagPos = input.indexOf(startTag, startPos);
|
||||||
|
if(tagPos < 0)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int attrPos = input.indexOf(attrStr, tagPos + 1);
|
||||||
|
if(attrPos < 0)
|
||||||
|
{
|
||||||
|
startPos = tagPos + 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int nextClosePos = input.indexOf(">", tagPos + 1);
|
||||||
|
if(attrPos < nextClosePos)
|
||||||
|
{
|
||||||
|
// Ooh, found one
|
||||||
|
int closeQuotePos = input.indexOf("\"", attrPos + attrStr.length() + 1);
|
||||||
|
if(closeQuotePos > 0)
|
||||||
|
{
|
||||||
|
String urlStr = input.substring(attrPos + attrStr.length(), closeQuotePos);
|
||||||
|
if(urlStr.indexOf('#') != -1)
|
||||||
|
{
|
||||||
|
urlStr = urlStr.substring(0, urlStr.indexOf('#'));
|
||||||
|
}
|
||||||
|
//LechLogger.debug("Found possible URL string: " + URL);
|
||||||
|
|
||||||
|
if(isMailTo(urlStr))
|
||||||
|
{
|
||||||
|
logMailURL(urlStr);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
|
||||||
|
URL u = new URL(sourceURL, urlStr);
|
||||||
|
if(newURLSet.contains(u))
|
||||||
|
{
|
||||||
|
//LechLogger.debug("Already found URL on page: " + u);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
newURLs.add(u);
|
||||||
|
newURLSet.add(u);
|
||||||
|
//LechLogger.debug("Found new URL on page: " + u);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch(MalformedURLException murle)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
startPos = tagPos + 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
startPos = tagPos + 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void logMailURL(String url)
|
||||||
|
{
|
||||||
|
LechLogger.debug("logMailURL()");
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
FileWriter appendedFile = new FileWriter(config.getMailtoLogFile().toString(), true);
|
||||||
|
PrintWriter pW = new PrintWriter(appendedFile);
|
||||||
|
pW.println(url);
|
||||||
|
pW.flush();
|
||||||
|
pW.close();
|
||||||
|
}
|
||||||
|
catch(IOException ioe)
|
||||||
|
{
|
||||||
|
LechLogger.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if a particular URL looks like it's a mailto: style link.
|
||||||
|
*/
|
||||||
|
private boolean isMailTo(String url)
|
||||||
|
{
|
||||||
|
if(url == null)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
url = url.toUpperCase();
|
||||||
|
return (url.indexOf("MAILTO:") != -1);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,333 @@
|
||||||
|
/*
|
||||||
|
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 Brian Pitcher
|
||||||
|
* Copyright (c) 2004 Andrew Coleman
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
import weblech.ui.LechLogger;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.io.*;
|
||||||
|
import java.net.URL;
|
||||||
|
|
||||||
|
public class Spider implements Runnable, Constants
|
||||||
|
{
|
||||||
|
/** Config for the spider */
|
||||||
|
private SpiderConfig config;
|
||||||
|
/**
|
||||||
|
* Download queue.
|
||||||
|
* Thread safety: To access the queue, first synchronize on it.
|
||||||
|
*/
|
||||||
|
private DownloadQueue queue;
|
||||||
|
/**
|
||||||
|
* Set of URLs downloaded or scheduled, so we don't download a
|
||||||
|
* URL more than once.
|
||||||
|
* Thread safety: To access the set, first synchronize on it.
|
||||||
|
*/
|
||||||
|
private Set urlsDownloadedOrScheduled;
|
||||||
|
/**
|
||||||
|
* Set of URLs currently being downloaded by Spider threads.
|
||||||
|
* Thread safety: To access the set, first synchronize on it.
|
||||||
|
*/
|
||||||
|
private Set urlsDownloading;
|
||||||
|
/**
|
||||||
|
* Number of downloads currently taking place.
|
||||||
|
* Thread safety: To modify this value, first synchronize on
|
||||||
|
* the download queue.
|
||||||
|
*/
|
||||||
|
private int downloadsInProgress;
|
||||||
|
/** Whether the spider should quit */
|
||||||
|
private boolean quit;
|
||||||
|
/** Count of running Spider threads. */
|
||||||
|
private int running;
|
||||||
|
/** Time we last checkpointed. */
|
||||||
|
private long lastCheckpoint;
|
||||||
|
|
||||||
|
/**
 * Create a spider whose queue initially holds just the configured start
 * location at depth 0. No threads are started until start() is called.
 *
 * @param config the configuration for this spider
 */
public Spider(SpiderConfig config)
{
    this.config = config;
    queue = new DownloadQueue(config);
    queue.queueURL(new URLToDownload(config.getStartLocation(), 0));
    urlsDownloadedOrScheduled = new HashSet();
    // The start location has just been scheduled, so record it; otherwise the
    // first page that links back to it would cause it to be downloaded again.
    urlsDownloadedOrScheduled.add(config.getStartLocation());
    urlsDownloading = new HashSet();
    downloadsInProgress = 0;
    lastCheckpoint = 0;
}
|
||||||
|
|
||||||
|
public void start()
|
||||||
|
{
|
||||||
|
quit = false;
|
||||||
|
running = 0;
|
||||||
|
|
||||||
|
for(int i = 0; i < config.getSpiderThreads(); i++)
|
||||||
|
{
|
||||||
|
LechLogger.info("Starting Spider thread");
|
||||||
|
Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
|
||||||
|
t.start();
|
||||||
|
running++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void stop()
|
||||||
|
{
|
||||||
|
quit = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isRunning()
|
||||||
|
{
|
||||||
|
return running == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkpointIfNeeded()
|
||||||
|
{
|
||||||
|
if(config.getCheckpointInterval() == 0)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
|
||||||
|
{
|
||||||
|
synchronized(queue)
|
||||||
|
{
|
||||||
|
if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
|
||||||
|
{
|
||||||
|
writeCheckpoint();
|
||||||
|
lastCheckpoint = System.currentTimeMillis();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeCheckpoint()
|
||||||
|
{
|
||||||
|
LechLogger.debug("writeCheckpoint()");
|
||||||
|
try
|
||||||
|
{
|
||||||
|
FileOutputStream fos = new FileOutputStream("spider.checkpoint", false);
|
||||||
|
ObjectOutputStream oos = new ObjectOutputStream(fos);
|
||||||
|
oos.writeObject(queue);
|
||||||
|
oos.writeObject(urlsDownloading);
|
||||||
|
oos.close();
|
||||||
|
}
|
||||||
|
catch(IOException ioe)
|
||||||
|
{
|
||||||
|
LechLogger.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void readCheckpoint()
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
FileInputStream fis = new FileInputStream("spider.checkpoint");
|
||||||
|
ObjectInputStream ois = new ObjectInputStream(fis);
|
||||||
|
queue = (DownloadQueue) ois.readObject();
|
||||||
|
urlsDownloading = (Set) ois.readObject();
|
||||||
|
queue.queueURLs(urlsDownloading);
|
||||||
|
urlsDownloading.clear();
|
||||||
|
}
|
||||||
|
catch(Exception e)
|
||||||
|
{
|
||||||
|
LechLogger.error("Caught exception reading checkpoint: " + e.getMessage(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void run()
|
||||||
|
{
|
||||||
|
HTMLParser htmlParser = new HTMLParser(config);
|
||||||
|
URLGetter urlGetter = new URLGetter(config);
|
||||||
|
|
||||||
|
while((queueSize() > 0 || downloadsInProgress > 0) && quit == false)
|
||||||
|
{
|
||||||
|
checkpointIfNeeded();
|
||||||
|
if(queueSize() == 0 && downloadsInProgress > 0)
|
||||||
|
{
|
||||||
|
// Wait for a download to finish before seeing if this thread should stop
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Thread.sleep(QUEUE_CHECK_INTERVAL);
|
||||||
|
}
|
||||||
|
catch(InterruptedException ignored)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
// Have another go at the loop
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if(queueSize() == 0)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
URLToDownload nextURL;
|
||||||
|
synchronized(queue)
|
||||||
|
{
|
||||||
|
nextURL = queue.getNextInQueue();
|
||||||
|
downloadsInProgress++;
|
||||||
|
}
|
||||||
|
synchronized(urlsDownloading)
|
||||||
|
{
|
||||||
|
urlsDownloading.add(nextURL);
|
||||||
|
}
|
||||||
|
int newDepth = nextURL.getDepth() + 1;
|
||||||
|
int maxDepth = config.getMaxDepth();
|
||||||
|
synchronized(urlsDownloading)
|
||||||
|
{
|
||||||
|
urlsDownloading.remove(nextURL);
|
||||||
|
}
|
||||||
|
List newURLs = downloadURL(nextURL, urlGetter, htmlParser);
|
||||||
|
|
||||||
|
newURLs = filterURLs(newURLs);
|
||||||
|
|
||||||
|
ArrayList u2dsToQueue = new ArrayList();
|
||||||
|
for(Iterator i = newURLs.iterator(); i.hasNext(); )
|
||||||
|
{
|
||||||
|
URL u = (URL) i.next();
|
||||||
|
// Download if not yet downloaded, and the new depth is less than the maximum
|
||||||
|
synchronized(urlsDownloadedOrScheduled)
|
||||||
|
{
|
||||||
|
if(!urlsDownloadedOrScheduled.contains(u)
|
||||||
|
&& (maxDepth == 0 || newDepth <= maxDepth))
|
||||||
|
{
|
||||||
|
u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));
|
||||||
|
urlsDownloadedOrScheduled.add(u);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
synchronized(queue)
|
||||||
|
{
|
||||||
|
queue.queueURLs(u2dsToQueue);
|
||||||
|
downloadsInProgress--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LechLogger.info("Spider thread stopping [" + config.getStartLocation() + "]" );
|
||||||
|
running--;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the size of the download queue in a thread-safe manner.
|
||||||
|
*/
|
||||||
|
private int queueSize()
|
||||||
|
{
|
||||||
|
synchronized(queue)
|
||||||
|
{
|
||||||
|
return queue.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a URL, and return new URLs that are referenced from it.
|
||||||
|
*
|
||||||
|
* @return A List of URL objects.
|
||||||
|
*/
|
||||||
|
private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)
|
||||||
|
{
|
||||||
|
LechLogger.debug("downloadURL(" + url + ")");
|
||||||
|
|
||||||
|
// Bail out early if image and already on disk
|
||||||
|
URLObject obj = new URLObject(url.getURL(), config);
|
||||||
|
if(obj.existsOnDisk())
|
||||||
|
{
|
||||||
|
if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))
|
||||||
|
{
|
||||||
|
LechLogger.info("Q: [" + queue + "] " + url);
|
||||||
|
obj = urlGetter.getURL(url);
|
||||||
|
}
|
||||||
|
else if(config.refreshImages() && obj.isImage())
|
||||||
|
{
|
||||||
|
LechLogger.info("Q: [" + queue + "] " + url);
|
||||||
|
obj = urlGetter.getURL(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LechLogger.info("Q: [" + queue + "] " + url);
|
||||||
|
obj = urlGetter.getURL(url);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(obj == null)
|
||||||
|
{
|
||||||
|
return new ArrayList();
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!obj.existsOnDisk())
|
||||||
|
{
|
||||||
|
obj.writeToFile();
|
||||||
|
}
|
||||||
|
|
||||||
|
if(obj.isHTML() || obj.isXML())
|
||||||
|
{
|
||||||
|
return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());
|
||||||
|
}
|
||||||
|
else if(obj.isImage())
|
||||||
|
{
|
||||||
|
return new ArrayList();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LechLogger.warn("Unknown content type received: " + obj.getContentType());
|
||||||
|
LechLogger.info("URL was " + url);
|
||||||
|
return new ArrayList();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List filterURLs(List URLs)
|
||||||
|
{
|
||||||
|
String match = config.getURLMatch();
|
||||||
|
ArrayList retVal = new ArrayList();
|
||||||
|
|
||||||
|
synchronized(urlsDownloadedOrScheduled)
|
||||||
|
{
|
||||||
|
for(Iterator i = URLs.iterator(); i.hasNext(); )
|
||||||
|
{
|
||||||
|
URL u = (URL) i.next();
|
||||||
|
if(urlsDownloadedOrScheduled.contains(u))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
String s = u.toString();
|
||||||
|
if(s.indexOf(match) != -1)
|
||||||
|
{
|
||||||
|
retVal.add(u);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return retVal;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Method By Coleman
|
||||||
|
* A basic check to see if there is another spider downloading the same thing
|
||||||
|
*/
|
||||||
|
protected boolean compareSpiderConfig ( SpiderConfig sc ) {
|
||||||
|
return config.getStartLocation().equals ( sc.getStartLocation() );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Method By Coleman
|
||||||
|
* A method to determine if one spider is downloading the same file as another spider
|
||||||
|
*/
|
||||||
|
public boolean equals ( Object o ) {
|
||||||
|
if ( !o.getClass().isInstance ( this ) ) return false;
|
||||||
|
return ((Spider) o).compareSpiderConfig ( config );
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,464 @@
|
||||||
|
/*
|
||||||
|
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 Brian Pitcher
|
||||||
|
* Copyright (c) 2004 Andrew Coleman
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
import weblech.ui.LechLogger;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.*;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
|
||||||
|
public class SpiderConfig implements Serializable
|
||||||
|
{
|
||||||
|
private File saveRootDirectory;
|
||||||
|
private File mailtoLogFile;
|
||||||
|
|
||||||
|
private boolean refreshHTMLs;
|
||||||
|
private boolean refreshImages;
|
||||||
|
private boolean refreshOthers;
|
||||||
|
|
||||||
|
private Set htmlExtensions;
|
||||||
|
private Set imageExtensions;
|
||||||
|
|
||||||
|
private URL startLocation;
|
||||||
|
private String urlMatch;
|
||||||
|
|
||||||
|
private List interestingURLSubstrings;
|
||||||
|
private List boringURLSubstrings;
|
||||||
|
|
||||||
|
private boolean depthFirst;
|
||||||
|
private int maxDepth;
|
||||||
|
|
||||||
|
private String userAgent;
|
||||||
|
|
||||||
|
private String basicAuthUser;
|
||||||
|
private String basicAuthPassword;
|
||||||
|
|
||||||
|
private int spiderThreads;
|
||||||
|
|
||||||
|
private long checkpointInterval;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a default config.
|
||||||
|
*/
|
||||||
|
public SpiderConfig()
|
||||||
|
{
|
||||||
|
LechLogger.debug("SpiderConfig()");
|
||||||
|
|
||||||
|
saveRootDirectory = new File(".");
|
||||||
|
mailtoLogFile = new File("mailto.txt");
|
||||||
|
|
||||||
|
refreshHTMLs = true;
|
||||||
|
refreshImages = false;
|
||||||
|
refreshOthers = false;
|
||||||
|
|
||||||
|
htmlExtensions = new HashSet();
|
||||||
|
htmlExtensions.add("htm");
|
||||||
|
htmlExtensions.add("html");
|
||||||
|
htmlExtensions.add("shtml");
|
||||||
|
|
||||||
|
imageExtensions = new HashSet();
|
||||||
|
imageExtensions.add("jpg");
|
||||||
|
imageExtensions.add("gif");
|
||||||
|
imageExtensions.add("png");
|
||||||
|
/* Added a few image extensions -- Coleman */
|
||||||
|
imageExtensions.add("tiff");
|
||||||
|
imageExtensions.add("bmp");
|
||||||
|
|
||||||
|
urlMatch = null;
|
||||||
|
interestingURLSubstrings = new ArrayList();
|
||||||
|
boringURLSubstrings = new ArrayList();
|
||||||
|
depthFirst = false;
|
||||||
|
maxDepth = 0;
|
||||||
|
|
||||||
|
userAgent = "WebLech Spider [Release C]";
|
||||||
|
basicAuthUser = "";
|
||||||
|
basicAuthPassword = "";
|
||||||
|
|
||||||
|
spiderThreads = 1;
|
||||||
|
|
||||||
|
checkpointInterval = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a config from a java.util.Properties object.
|
||||||
|
*/
|
||||||
|
public SpiderConfig(Properties props)
|
||||||
|
{
|
||||||
|
LechLogger.debug("SpiderConfig(props)");
|
||||||
|
|
||||||
|
saveRootDirectory = new File(props.getProperty("saveRootDirectory", "."));
|
||||||
|
if(!saveRootDirectory.exists())
|
||||||
|
{
|
||||||
|
if(!saveRootDirectory.mkdirs())
|
||||||
|
{
|
||||||
|
LechLogger.error("Couldn't create root directory: " + saveRootDirectory);
|
||||||
|
LechLogger.info("Defaulting to . instead");
|
||||||
|
saveRootDirectory = new File(".");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(!saveRootDirectory.isDirectory())
|
||||||
|
{
|
||||||
|
LechLogger.error("Save root is not a directory: " + saveRootDirectory);
|
||||||
|
LechLogger.info("Defaulting to . instead");
|
||||||
|
saveRootDirectory = new File(".");
|
||||||
|
}
|
||||||
|
|
||||||
|
String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt");
|
||||||
|
// Check if absolute or relative name given
|
||||||
|
if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\"))
|
||||||
|
{
|
||||||
|
LechLogger.debug("Using absolute file name " + mailtoFileStr);
|
||||||
|
mailtoLogFile = new File(mailtoFileStr);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LechLogger.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr);
|
||||||
|
mailtoLogFile = new File(saveRootDirectory.getPath() + "/" + mailtoFileStr);
|
||||||
|
}
|
||||||
|
|
||||||
|
refreshHTMLs = Boolean.valueOf(props.getProperty("refreshHTMLs", "true")).booleanValue();
|
||||||
|
refreshImages = Boolean.valueOf(props.getProperty("refreshImages", "false")).booleanValue();
|
||||||
|
refreshOthers = Boolean.valueOf(props.getProperty("refreshOthers", "false")).booleanValue();
|
||||||
|
|
||||||
|
htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml"));
|
||||||
|
imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png"));
|
||||||
|
|
||||||
|
String startLocStr = props.getProperty("startLocation");
|
||||||
|
if(startLocStr != null)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
startLocation = new URL(startLocStr);
|
||||||
|
}
|
||||||
|
catch(MalformedURLException murle)
|
||||||
|
{
|
||||||
|
LechLogger.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LechLogger.warn("startLocation not found in properties");
|
||||||
|
}
|
||||||
|
|
||||||
|
urlMatch = props.getProperty("urlMatch");
|
||||||
|
|
||||||
|
interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs"));
|
||||||
|
boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs"));
|
||||||
|
|
||||||
|
depthFirst = Boolean.valueOf(props.getProperty("depthFirst", "false")).booleanValue();
|
||||||
|
try
|
||||||
|
{
|
||||||
|
String maxDepthStr = props.getProperty("maxDepth", "0");
|
||||||
|
maxDepth = Integer.parseInt(maxDepthStr);
|
||||||
|
}
|
||||||
|
catch(NumberFormatException nfe)
|
||||||
|
{
|
||||||
|
LechLogger.error("Caught number format exception parsing max depth, defaulting to 1", nfe);
|
||||||
|
maxDepth = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
userAgent = props.getProperty("userAgent", "WebLech Spider [Version C]");
|
||||||
|
basicAuthUser = props.getProperty("basicAuthUser", "");
|
||||||
|
basicAuthPassword = props.getProperty("basicAuthPassword", "");
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
String threadsStr = props.getProperty("spiderThreads", "1");
|
||||||
|
spiderThreads = Integer.parseInt(threadsStr);
|
||||||
|
}
|
||||||
|
catch(NumberFormatException nfe)
|
||||||
|
{
|
||||||
|
LechLogger.error("Caught number format exception parsing number of threads, defaulting to 1", nfe);
|
||||||
|
spiderThreads = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
String intervalStr = props.getProperty("checkpointInterval", "0");
|
||||||
|
checkpointInterval = Long.parseLong(intervalStr);
|
||||||
|
}
|
||||||
|
catch(NumberFormatException nfe)
|
||||||
|
{
|
||||||
|
LechLogger.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
|
||||||
|
spiderThreads = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List parsePropCommaSeparated(String str)
|
||||||
|
{
|
||||||
|
ArrayList result = new ArrayList();
|
||||||
|
if(str != null && str.length() > 0)
|
||||||
|
{
|
||||||
|
StringTokenizer tok = new StringTokenizer(str, ",");
|
||||||
|
while(tok.hasMoreTokens())
|
||||||
|
{
|
||||||
|
result.add(tok.nextToken());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setRefreshHTMLs(boolean refreshHTMLs)
|
||||||
|
{
|
||||||
|
this.refreshHTMLs = refreshHTMLs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean refreshHTMLs()
|
||||||
|
{
|
||||||
|
return refreshHTMLs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRefreshImages(boolean refreshImages)
|
||||||
|
{
|
||||||
|
this.refreshImages = refreshImages;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean refreshImages()
|
||||||
|
{
|
||||||
|
return refreshImages;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRefreshOthers(boolean refreshOthers)
|
||||||
|
{
|
||||||
|
this.refreshOthers = refreshOthers;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean refreshOthers()
|
||||||
|
{
|
||||||
|
return refreshOthers;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSaveRootDirectory(File saveRootDirectory)
|
||||||
|
{
|
||||||
|
this.saveRootDirectory = saveRootDirectory;
|
||||||
|
}
|
||||||
|
|
||||||
|
public File getSaveRootDirectory()
|
||||||
|
{
|
||||||
|
return saveRootDirectory;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMailtoLogFile(File mailtoLogFile)
|
||||||
|
{
|
||||||
|
this.mailtoLogFile = mailtoLogFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
public File getMailtoLogFile()
|
||||||
|
{
|
||||||
|
return mailtoLogFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setStartLocation(URL startLocation)
|
||||||
|
{
|
||||||
|
this.startLocation = startLocation;
|
||||||
|
}
|
||||||
|
|
||||||
|
public URL getStartLocation()
|
||||||
|
{
|
||||||
|
return startLocation;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setURLMatch(String urlMatch)
|
||||||
|
{
|
||||||
|
this.urlMatch = urlMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getURLMatch()
|
||||||
|
{
|
||||||
|
return urlMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getInterestingURLSubstrings()
|
||||||
|
{
|
||||||
|
return interestingURLSubstrings;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setInterestingURLSubstrings(List interestingURLSubstrings)
|
||||||
|
{
|
||||||
|
this.interestingURLSubstrings = interestingURLSubstrings;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getBoringURLSubstrings()
|
||||||
|
{
|
||||||
|
return boringURLSubstrings;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBoringURLSubstrings(List boringURLSubstrings)
|
||||||
|
{
|
||||||
|
this.boringURLSubstrings = boringURLSubstrings;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isInteresting(URL u)
|
||||||
|
{
|
||||||
|
return matchURL(u, interestingURLSubstrings);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isBoring(URL u)
|
||||||
|
{
|
||||||
|
return matchURL(u, boringURLSubstrings);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean matchURL(URL u, List substrings)
|
||||||
|
{
|
||||||
|
String str = u.toExternalForm();
|
||||||
|
for(Iterator i = substrings.iterator(); i.hasNext(); )
|
||||||
|
{
|
||||||
|
String substr = (String) i.next();
|
||||||
|
if(str.indexOf(substr) != -1)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDepthFirstSearch(boolean depthFirst)
|
||||||
|
{
|
||||||
|
this.depthFirst = depthFirst;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isDepthFirstSearch()
|
||||||
|
{
|
||||||
|
return depthFirst;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMaxDepth(int maxDepth)
|
||||||
|
{
|
||||||
|
this.maxDepth = maxDepth;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMaxDepth()
|
||||||
|
{
|
||||||
|
return maxDepth;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setUserAgent(String userAgent)
|
||||||
|
{
|
||||||
|
this.userAgent = userAgent;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getUserAgent()
|
||||||
|
{
|
||||||
|
return userAgent;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBasicAuthUser(String basicAuthUser)
|
||||||
|
{
|
||||||
|
this.basicAuthUser = basicAuthUser;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getBasicAuthUser()
|
||||||
|
{
|
||||||
|
return basicAuthUser;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBasicAuthPassword(String basicAuthPassword)
|
||||||
|
{
|
||||||
|
this.basicAuthPassword = basicAuthPassword;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getBasicAuthPassword()
|
||||||
|
{
|
||||||
|
return basicAuthPassword;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSpiderThreads(int spiderThreads)
|
||||||
|
{
|
||||||
|
this.spiderThreads = spiderThreads;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getSpiderThreads()
|
||||||
|
{
|
||||||
|
return spiderThreads;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCheckpointInterval(long interval)
|
||||||
|
{
|
||||||
|
this.checkpointInterval = interval;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getCheckpointInterval()
|
||||||
|
{
|
||||||
|
return checkpointInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString()
|
||||||
|
{
|
||||||
|
return "depthFirst:\t" + depthFirst
|
||||||
|
+ "\nmaxDepth:\t" + maxDepth
|
||||||
|
+ "\nhtmlExtensions:\t" + fromSet(htmlExtensions)
|
||||||
|
+ "\nimageExtensions:\t" + fromSet(imageExtensions)
|
||||||
|
+ "\nrefreshHTMLs:\t" + refreshHTMLs
|
||||||
|
+ "\nrefreshImages:\t" + refreshImages
|
||||||
|
+ "\nrefreshOthers:\t" + refreshOthers
|
||||||
|
+ "\nsaveRootDirectory:\t" + saveRootDirectory
|
||||||
|
+ "\nstartLocation:\t" + startLocation
|
||||||
|
+ "\nurlMatch:\t" + urlMatch
|
||||||
|
+ "\nuserAgent:\t" + userAgent
|
||||||
|
+ "\nbasicAuthUser:\t" + basicAuthUser
|
||||||
|
+ "\nbasicAuthPassword:\t" + "***"
|
||||||
|
+ "\nspiderThreads:\t" + spiderThreads
|
||||||
|
+ "\ncheckpointInterval:\t" + checkpointInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Set parseSet(String str)
|
||||||
|
{
|
||||||
|
LechLogger.debug("parseSet(" + str + ")");
|
||||||
|
HashSet result = new HashSet();
|
||||||
|
StringTokenizer sTok = new StringTokenizer(str, ",");
|
||||||
|
while(sTok.hasMoreTokens())
|
||||||
|
{
|
||||||
|
String tok = sTok.nextToken().trim();
|
||||||
|
result.add(tok);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String fromSet(Set s)
|
||||||
|
{
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
boolean first = true;
|
||||||
|
for(Iterator i = s.iterator(); i.hasNext(); )
|
||||||
|
{
|
||||||
|
String str = (String) i.next();
|
||||||
|
if(first)
|
||||||
|
{
|
||||||
|
first = false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sb.append(",");
|
||||||
|
}
|
||||||
|
sb.append(str);
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // End class SpiderConfig
|
|
@ -0,0 +1,138 @@
|
||||||
|
/*
|
||||||
|
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 Brian Pitcher
|
||||||
|
* Copyright (c) 2004 Andrew Coleman
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.Authenticator;
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
import weblech.ui.LechLogger;
|
||||||
|
|
||||||
|
public class URLGetter
|
||||||
|
{
|
||||||
|
private int failureCount = 0;
|
||||||
|
|
||||||
|
private final SpiderConfig config;
|
||||||
|
|
||||||
|
public URLGetter(SpiderConfig config)
|
||||||
|
{
|
||||||
|
LechLogger.debug("URLGetter()");
|
||||||
|
this.config = config;
|
||||||
|
|
||||||
|
Authenticator.setDefault(new DumbAuthenticator(config.getBasicAuthUser(), config.getBasicAuthPassword()));
|
||||||
|
}
|
||||||
|
|
||||||
|
public URLObject getURL(URLToDownload url)
|
||||||
|
{
|
||||||
|
LechLogger.debug("getURL(" + url + ")");
|
||||||
|
|
||||||
|
if(failureCount > 10)
|
||||||
|
{
|
||||||
|
LechLogger.warn("Lots of failures recently, waiting 5 seconds before attempting download");
|
||||||
|
try { Thread.sleep(5 * 1000); } catch(InterruptedException e) { };
|
||||||
|
failureCount = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
URL requestedURL = url.getURL();
|
||||||
|
URL referer = url.getReferer();
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
LechLogger.debug("Creating HTTP connection to " + requestedURL);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) requestedURL.openConnection();
|
||||||
|
if(referer != null)
|
||||||
|
{
|
||||||
|
LechLogger.debug("Setting Referer header to " + referer);
|
||||||
|
conn.setRequestProperty("Referer", referer.toExternalForm());
|
||||||
|
}
|
||||||
|
|
||||||
|
if(config.getUserAgent() != null)
|
||||||
|
{
|
||||||
|
LechLogger.debug("Setting User-Agent to " + config.getUserAgent());
|
||||||
|
conn.setRequestProperty("User-Agent", config.getUserAgent());
|
||||||
|
}
|
||||||
|
|
||||||
|
conn.setUseCaches(false);
|
||||||
|
|
||||||
|
LechLogger.debug("Opening URL");
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
conn.connect();
|
||||||
|
|
||||||
|
String resp = conn.getResponseMessage();
|
||||||
|
LechLogger.debug("Remote server response: " + resp);
|
||||||
|
|
||||||
|
String respStr = conn.getHeaderField(0);
|
||||||
|
LechLogger.info("Server response: " + respStr);
|
||||||
|
|
||||||
|
for(int i = 1; ; i++)
|
||||||
|
{
|
||||||
|
String key = conn.getHeaderFieldKey(i);
|
||||||
|
if(key == null)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
String value = conn.getHeaderField(key);
|
||||||
|
LechLogger.debug("Received header " + key + ": " + value);
|
||||||
|
}
|
||||||
|
|
||||||
|
LechLogger.debug("Getting buffered input stream from remote connection");
|
||||||
|
BufferedInputStream remoteBIS = new BufferedInputStream(conn.getInputStream());
|
||||||
|
ByteArrayOutputStream baos = new ByteArrayOutputStream(10240);
|
||||||
|
byte[] buf = new byte[1024];
|
||||||
|
int bytesRead = 0;
|
||||||
|
while(bytesRead >= 0)
|
||||||
|
{
|
||||||
|
baos.write(buf, 0, bytesRead);
|
||||||
|
bytesRead = remoteBIS.read(buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
byte[] content = baos.toByteArray();
|
||||||
|
long timeTaken = System.currentTimeMillis() - startTime;
|
||||||
|
if(timeTaken < 100) timeTaken = 500;
|
||||||
|
|
||||||
|
int bytesPerSec = (int) ((double) content.length / ((double)timeTaken / 1000.0));
|
||||||
|
LechLogger.info("Downloaded " + content.length + " bytes, " + bytesPerSec + " bytes/sec");
|
||||||
|
if(content.length < conn.getContentLength())
|
||||||
|
{
|
||||||
|
LechLogger.warn("Didn't download full content for URL: " + url);
|
||||||
|
failureCount++;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return new URLObject(requestedURL, conn.getContentType(), content, config);
|
||||||
|
}
|
||||||
|
catch(FileNotFoundException fnfe) {
|
||||||
|
LechLogger.warn("File not found: " + fnfe.getMessage());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
catch(IOException ioe)
|
||||||
|
{
|
||||||
|
LechLogger.warn("Caught IO Exception: " + ioe.getMessage(), ioe);
|
||||||
|
failureCount++;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,206 @@
|
||||||
|
/*
|
||||||
|
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 Brian Pitcher
|
||||||
|
* Copyright (c) 2004 Andrew Coleman
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.URLEncoder;
|
||||||
|
|
||||||
|
import weblech.ui.LechLogger;
|
||||||
|
|
||||||
|
public class URLObject
|
||||||
|
{
|
||||||
|
private final URL sourceURL;
|
||||||
|
private final String contentType;
|
||||||
|
private final byte[] content;
|
||||||
|
|
||||||
|
private final SpiderConfig config;
|
||||||
|
|
||||||
|
public URLObject(URL sourceURL, String contentType, byte[] content, SpiderConfig config)
|
||||||
|
{
|
||||||
|
this.sourceURL = sourceURL;
|
||||||
|
this.contentType = contentType;
|
||||||
|
this.content = content;
|
||||||
|
this.config = config;
|
||||||
|
}
|
||||||
|
|
||||||
|
public URLObject(URL sourceURL, SpiderConfig config)
|
||||||
|
{
|
||||||
|
this.sourceURL = sourceURL;
|
||||||
|
this.config = config;
|
||||||
|
|
||||||
|
String s = sourceURL.toExternalForm().toLowerCase();
|
||||||
|
if(s.indexOf(".jpg") != -1)
|
||||||
|
{
|
||||||
|
contentType = "image/jpeg";
|
||||||
|
}
|
||||||
|
else if(s.indexOf(".gif") != -1)
|
||||||
|
{
|
||||||
|
contentType = "image/gif";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
contentType = "text/html";
|
||||||
|
}
|
||||||
|
|
||||||
|
if(existsOnDisk())
|
||||||
|
{
|
||||||
|
|
||||||
|
File f = new File(convertToFileName());
|
||||||
|
if(f.isDirectory())
|
||||||
|
{
|
||||||
|
f = new File(f, "index.html");
|
||||||
|
}
|
||||||
|
content = new byte[(int) f.length()];
|
||||||
|
try
|
||||||
|
{
|
||||||
|
FileInputStream in = new FileInputStream(f);
|
||||||
|
in.read(content);
|
||||||
|
in.close();
|
||||||
|
}
|
||||||
|
catch(IOException ioe)
|
||||||
|
{
|
||||||
|
LechLogger.warn("IO Exception reading disk version of URL " + sourceURL, ioe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
content = new byte[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getContentType()
|
||||||
|
{
|
||||||
|
return contentType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isHTML()
|
||||||
|
{
|
||||||
|
return contentType.toLowerCase().startsWith("text/html");
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isXML()
|
||||||
|
{
|
||||||
|
return contentType.toLowerCase().startsWith("text/xml");
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isImage()
|
||||||
|
{
|
||||||
|
return contentType.startsWith("image/");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getStringContent()
|
||||||
|
{
|
||||||
|
return new String(content);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String convertToFileName()
|
||||||
|
{
|
||||||
|
String url = sourceURL.toExternalForm();
|
||||||
|
int httpIdx = url.indexOf("http://");
|
||||||
|
if(httpIdx == 0)
|
||||||
|
{
|
||||||
|
url = url.substring(7);
|
||||||
|
}
|
||||||
|
// Check for at least one slash -- otherwise host name (e.g. sourceforge.net)
|
||||||
|
if(url.indexOf("/") < 0)
|
||||||
|
{
|
||||||
|
url = url + "/";
|
||||||
|
}
|
||||||
|
// If trailing slash, add index.html as default
|
||||||
|
if(url.endsWith("/"))
|
||||||
|
{
|
||||||
|
url = url + "index.html";
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
/* the old encode method is now deprecated, updated to the new API -- Coleman */
|
||||||
|
url = textReplace("?", URLEncoder.encode("?","UTF-8"), url);
|
||||||
|
url = textReplace("&", URLEncoder.encode("&","UTF-8"), url);
|
||||||
|
}
|
||||||
|
catch ( java.io.UnsupportedEncodingException exception ) {
|
||||||
|
LechLogger.error ( exception.toString() );
|
||||||
|
}
|
||||||
|
return config.getSaveRootDirectory().getPath() + "/" + url;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean existsOnDisk()
|
||||||
|
{
|
||||||
|
File f = new File(convertToFileName());
|
||||||
|
return (f.exists() && !f.isDirectory());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeToFile()
|
||||||
|
{
|
||||||
|
writeToFile(convertToFileName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeToFile(String fileName)
|
||||||
|
{
|
||||||
|
LechLogger.debug("writeToFile(" + fileName + ")");
|
||||||
|
try
|
||||||
|
{
|
||||||
|
File f = new File(fileName);
|
||||||
|
f.getParentFile().mkdirs();
|
||||||
|
FileOutputStream out = new FileOutputStream(fileName);
|
||||||
|
out.write(content);
|
||||||
|
out.flush();
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
catch(IOException ioe)
|
||||||
|
{
|
||||||
|
LechLogger.warn("IO Exception writing to " + fileName, ioe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString()
|
||||||
|
{
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append("URLObject: ");
|
||||||
|
sb.append(contentType);
|
||||||
|
if(false)//isHTML() || isXML())
|
||||||
|
{
|
||||||
|
sb.append("\n");
|
||||||
|
sb.append(getStringContent());
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String textReplace(String find, String replace, String input)
|
||||||
|
{
|
||||||
|
int startPos = 0;
|
||||||
|
while(true)
|
||||||
|
{
|
||||||
|
int textPos = input.indexOf(find, startPos);
|
||||||
|
if(textPos < 0)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
input = input.substring(0, textPos) + replace + input.substring(textPos + find.length());
|
||||||
|
startPos = textPos + replace.length();
|
||||||
|
}
|
||||||
|
return input;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,68 @@
|
||||||
|
/*
|
||||||
|
* This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 Brian Pitcher
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// $Header: /home/andrew/Projects/penguincoder/cvs/WebLech/weblech/spider/URLToDownload.java,v 1.1 2004/03/07 20:51:05 mercury Exp $
|
||||||
|
|
||||||
|
package weblech.spider;
|
||||||
|
|
||||||
|
import java.net.URL;
|
||||||
|
|
||||||
|
/**
 * One entry of the spider's work list: the URL to fetch, the page that
 * linked to it (if any) and the depth recorded for it. All three values are
 * fixed at construction time.
 */
public class URLToDownload implements java.io.Serializable {

    private final URL url;
    private final URL referer;
    private final int depth;

    /**
     * Create an entry that has no referer.
     *
     * @param url   the URL to download
     * @param depth the depth recorded for this URL
     */
    public URLToDownload(URL url, int depth) {
        this(url, null, depth);
    }

    /**
     * Create an entry.
     *
     * @param url     the URL to download
     * @param referer the referring URL, or null when there is none
     * @param depth   the depth recorded for this URL
     */
    public URLToDownload(URL url, URL referer, int depth) {
        this.depth = depth;
        this.referer = referer;
        this.url = url;
    }

    /** @return the URL to download */
    public URL getURL() {
        return this.url;
    }

    /** @return the referring URL, or null when none was given */
    public URL getReferer() {
        return this.referer;
    }

    /** @return the depth given at construction */
    public int getDepth() {
        return this.depth;
    }

    /** @return text of the form "url, referer referer, depth depth" */
    public String toString() {
        StringBuffer text = new StringBuffer();
        text.append(url);
        text.append(", referer ").append(referer);
        text.append(", depth ").append(depth);
        return text.toString();
    }
}
|
|
@ -0,0 +1,138 @@
|
||||||
|
/**
|
||||||
|
* LechLogger.java: A Graphical Logger
|
||||||
|
* The original weblech had a simple text interface and used an apache.org library for logging.
|
||||||
|
* When I wanted to make this thing into a graphical app, I realized the text logging
|
||||||
|
* was going to have to go. It assumes you only want to log to one location (not a problem for
|
||||||
|
* this application).
|
||||||
|
*/
|
||||||
|
package weblech.ui;
|
||||||
|
|
||||||
|
import javax.swing.JTextArea;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class LechLogger {

    /**
     * Text area that receives every message; while it is null all messages
     * are dropped.
     */
    private static JTextArea sink = null;

    /** Per-level switches; every level starts enabled. */
    private static boolean errorsOn = true;
    private static boolean warningsOn = true;
    private static boolean infoOn = true;
    private static boolean debugOn = true;

    /** Nothing to set up: all state is static and shared by every caller. */
    public LechLogger() {
    }

    /**
     * Choose the text component that will display the log.
     */
    public static void setTextArea ( JTextArea textarea ) {
        sink = textarea;
    }

    /**
     * Append one message and a line break to the text area. The method is
     * synchronized because the spider logs from several threads. The area
     * is switched to editable for the append and back to read-only after.
     * NOTE(review): the append runs on the calling thread, not on the Swing
     * event thread -- confirm that is acceptable for the target JDK.
     */
    private static synchronized void log ( String msg ) {
        JTextArea target = sink;
        if ( target != null ) {
            target.setEditable ( true );
            target.append ( msg );
            target.append ( "\n" );
            target.setEditable ( false );
        }
    }

    /**
     * Flip error logging on or off.
     */
    public static void setErrorLogging() {
        errorsOn = !errorsOn;
    }

    /**
     * Log an error message.
     */
    public static void error ( String msg ) {
        if ( errorsOn ) {
            log ( "*error>" + msg );
        }
    }

    /**
     * Log an error message followed, on the next line, by the exception's
     * message. The exception is only touched while error logging is on.
     */
    public static void error ( String msg, Exception exception ) {
        if ( errorsOn ) {
            String detail = exception.getMessage();
            log ( "*error>" + msg + "\n" + detail );
        }
    }

    /**
     * Flip informational logging on or off.
     */
    public static void setInformationalLogging() {
        infoOn = !infoOn;
    }

    /**
     * Log an informational message.
     */
    public static void info ( String msg ) {
        if ( infoOn ) {
            log ( "^info>" + msg );
        }
    }

    /**
     * Flip warning logging on or off.
     */
    public static void setWarningLogging() {
        warningsOn = !warningsOn;
    }

    /**
     * Log a warning message.
     */
    public static void warn ( String msg ) {
        if ( warningsOn ) {
            log ( "-warn>" + msg );
        }
    }

    /**
     * Log a warning message followed, on the next line, by the exception's
     * message. The exception is only touched while warning logging is on.
     */
    public static void warn ( String msg, IOException exception ) {
        if ( warningsOn ) {
            String detail = exception.getMessage();
            log ( "-warn>" + msg + "\n" + detail );
        }
    }

    /**
     * Flip debug logging on or off.
     */
    public static void setDebugLogging() {
        debugOn = !debugOn;
    }

    /**
     * Log a debugging statement to the logging text area.
     */
    public static void debug ( String msg ) {
        if ( debugOn ) {
            log ( "@debug>" + msg );
        }
    }
}
|
|
@ -0,0 +1,251 @@
|
||||||
|
/**
|
||||||
|
* SpiderConfigPanel.java: A graphical panel for configuring a SpiderConfig object.
|
||||||
|
* This panel provides a way to change the more practical options of the WebLech Spider.
|
||||||
|
* It supports saving and opening of SpiderConfigurations from a file. It does not use
|
||||||
|
* any of the "interesting" or "boring" url features, or the email link save file.
|
||||||
|
*/
|
||||||
|
package weblech.ui;
|
||||||
|
|
||||||
|
import weblech.spider.Spider;
|
||||||
|
import weblech.spider.SpiderConfig;
|
||||||
|
|
||||||
|
import javax.swing.JPanel;
|
||||||
|
import javax.swing.JTextField;
|
||||||
|
import javax.swing.JLabel;
|
||||||
|
import javax.swing.JButton;
|
||||||
|
import javax.swing.JComboBox;
|
||||||
|
import javax.swing.JFileChooser;
|
||||||
|
|
||||||
|
import java.awt.GridLayout;
|
||||||
|
import java.awt.FlowLayout;
|
||||||
|
|
||||||
|
import java.awt.event.ActionListener;
|
||||||
|
import java.awt.event.ActionEvent;
|
||||||
|
|
||||||
|
import java.util.Properties;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.ObjectInputStream;
|
||||||
|
import java.io.ObjectOutputStream;
|
||||||
|
|
||||||
|
public class SpiderConfigPanel extends JPanel implements ActionListener {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A list of all of the spiders that the GUI will have downloading. It is assumed
|
||||||
|
* that the user knows how much bandwidth you have and really wants to try and get
|
||||||
|
* several different sites at the same time.
|
||||||
|
*/
|
||||||
|
private ArrayList spiders;
|
||||||
|
/**
|
||||||
|
* Various text fields for the configuration options.
|
||||||
|
*/
|
||||||
|
private JTextField sitenamefield, dirfield, usernamefield, passwordfield, agentfield, depthfield, matchfield;
|
||||||
|
/**
|
||||||
|
* A selection box for the number of threads a new Spider should use, I am limiting
|
||||||
|
* the number of threads to 4 for simplicity.
|
||||||
|
*/
|
||||||
|
private JComboBox threadbox;
|
||||||
|
|
||||||
|
public SpiderConfigPanel() {
|
||||||
|
super ( new GridLayout ( 8, 1 ) );
|
||||||
|
spiders = new ArrayList();
|
||||||
|
|
||||||
|
/* Panel for the directory to save all files */
|
||||||
|
JPanel sitepanel = new JPanel();
|
||||||
|
((FlowLayout) sitepanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||||
|
JLabel sitelabel = new JLabel ( "Output Directory:" );
|
||||||
|
dirfield = new JTextField ( System.getProperty ( "user.home" ), 20 );
|
||||||
|
sitepanel.add ( sitelabel );
|
||||||
|
sitepanel.add ( dirfield );
|
||||||
|
|
||||||
|
/* Panel for the site to download */
|
||||||
|
JPanel outputpanel = new JPanel();
|
||||||
|
((FlowLayout) outputpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||||
|
JLabel dirlabel = new JLabel ( "Download Site:" );
|
||||||
|
sitenamefield = new JTextField ( "http://www.google.com/", 20 );
|
||||||
|
outputpanel.add ( dirlabel );
|
||||||
|
outputpanel.add ( sitenamefield );
|
||||||
|
|
||||||
|
/* Panel for the HTTP username */
|
||||||
|
JPanel usernamepanel = new JPanel();
|
||||||
|
((FlowLayout) usernamepanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||||
|
JLabel usernamelabel = new JLabel ( "Username:" );
|
||||||
|
usernamefield = new JTextField ( "", 20 );
|
||||||
|
usernamepanel.add ( usernamelabel );
|
||||||
|
usernamepanel.add ( usernamefield );
|
||||||
|
|
||||||
|
/* Panel for the HTTP password */
|
||||||
|
JPanel passpanel = new JPanel();
|
||||||
|
((FlowLayout) passpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||||
|
JLabel passwdlabel = new JLabel ( "Password:" );
|
||||||
|
passwordfield = new JTextField ( "", 20 );
|
||||||
|
passpanel.add ( passwdlabel );
|
||||||
|
passpanel.add ( passwordfield );
|
||||||
|
|
||||||
|
/* Panel for the HTTP user agent */
|
||||||
|
JPanel agentpanel = new JPanel();
|
||||||
|
((FlowLayout) agentpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||||
|
JLabel agentlabel = new JLabel ( "User Agent:" );
|
||||||
|
agentfield = new JTextField ( "WebLech [Version C]", 20 );
|
||||||
|
agentpanel.add ( agentlabel );
|
||||||
|
agentpanel.add ( agentfield );
|
||||||
|
|
||||||
|
/* Panel for a simple string match downloading limiter (no match, no download) */
|
||||||
|
JPanel matchpanel = new JPanel();
|
||||||
|
((FlowLayout) matchpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||||
|
JLabel matchlabel = new JLabel ( "Match String:" );
|
||||||
|
matchfield = new JTextField ( "", 20 );
|
||||||
|
matchpanel.add ( matchlabel );
|
||||||
|
matchpanel.add ( matchfield );
|
||||||
|
|
||||||
|
/* Provides a panel for placing both the maximum depth and threads for this spider */
|
||||||
|
JPanel detailpanel = new JPanel ( new GridLayout ( 1, 2 ) );
|
||||||
|
|
||||||
|
JPanel depthpanel = new JPanel();
|
||||||
|
((FlowLayout) depthpanel.getLayout()).setAlignment ( FlowLayout.RIGHT );
|
||||||
|
JLabel depthlabel = new JLabel ( "Max Depth:" );
|
||||||
|
depthfield = new JTextField ( Integer.toString ( 0 ), 5 );
|
||||||
|
depthpanel.add ( depthlabel );
|
||||||
|
depthpanel.add ( depthfield );
|
||||||
|
|
||||||
|
JPanel threadpanel = new JPanel();
|
||||||
|
JLabel threadlabel = new JLabel ( "Spider Threads:" );
|
||||||
|
Integer[] threaditems = { new Integer ( "1" ), new Integer ( "2" ), new Integer ( "3" ), new Integer ( "4" ) };
|
||||||
|
threadbox = new JComboBox ( threaditems );
|
||||||
|
threadpanel.add ( threadlabel );
|
||||||
|
threadpanel.add ( threadbox );
|
||||||
|
|
||||||
|
detailpanel.add ( depthpanel );
|
||||||
|
detailpanel.add ( threadpanel );
|
||||||
|
|
||||||
|
/* Panel of buttons for various operations */
|
||||||
|
JPanel buttonpanel = new JPanel();
|
||||||
|
JButton save = new JButton ( "Save" );
|
||||||
|
JButton spiderbutton = new JButton ( "Spider It" );
|
||||||
|
JButton open = new JButton ( "Open" );
|
||||||
|
JButton qbutton = new JButton ( "Quit" );
|
||||||
|
buttonpanel.add ( save );
|
||||||
|
buttonpanel.add ( spiderbutton );
|
||||||
|
buttonpanel.add ( open );
|
||||||
|
buttonpanel.add ( qbutton );
|
||||||
|
|
||||||
|
add ( sitepanel );
|
||||||
|
add ( outputpanel );
|
||||||
|
add ( usernamepanel );
|
||||||
|
add ( passpanel );
|
||||||
|
add ( agentpanel );
|
||||||
|
add ( matchpanel );
|
||||||
|
add ( detailpanel );
|
||||||
|
add ( buttonpanel );
|
||||||
|
|
||||||
|
/* Configure the button actions */
|
||||||
|
save.setActionCommand ( "save" );
|
||||||
|
open.setActionCommand ( "open" );
|
||||||
|
spiderbutton.setActionCommand ( "spider" );
|
||||||
|
qbutton.setActionCommand ( "quit" );
|
||||||
|
save.addActionListener ( this );
|
||||||
|
open.addActionListener ( this );
|
||||||
|
spiderbutton.addActionListener ( this );
|
||||||
|
qbutton.addActionListener ( this );
|
||||||
|
LechLogger.debug ( "Actions" );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method will create a Properties object good for instantiating a new SpiderConfig
|
||||||
|
* Object.
|
||||||
|
*/
|
||||||
|
private Properties createProperties() {
|
||||||
|
Properties p = new Properties();
|
||||||
|
p.setProperty ( "saveRootDirectory", dirfield.getText() );
|
||||||
|
p.setProperty ( "startLocation", sitenamefield.getText() );
|
||||||
|
p.setProperty ( "basicAuthUser", usernamefield.getText() );
|
||||||
|
p.setProperty ( "basicAuthPassword", passwordfield.getText() );
|
||||||
|
p.setProperty ( "urlMatch", matchfield.getText() );
|
||||||
|
p.setProperty ( "spiderThreads", ((Integer) threadbox.getSelectedItem()).toString() );
|
||||||
|
p.setProperty ( "maxDepth", depthfield.getText() );
|
||||||
|
p.setProperty ( "userAgent", agentfield.getText() );
|
||||||
|
p.setProperty ( "interestingURLs", "" );
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method will extract all of the values from a SpiderConfig object that the GUI uses
|
||||||
|
* and updates the panel to show the values in the object.
|
||||||
|
*/
|
||||||
|
private void setSpiderConfig ( SpiderConfig sc ) {
|
||||||
|
dirfield.setText ( sc.getSaveRootDirectory().toString() );
|
||||||
|
sitenamefield.setText ( sc.getStartLocation().toString() );
|
||||||
|
usernamefield.setText ( sc.getBasicAuthUser() );
|
||||||
|
passwordfield.setText ( sc.getBasicAuthPassword() );
|
||||||
|
matchfield.setText ( sc.getURLMatch() );
|
||||||
|
int t = sc.getSpiderThreads();
|
||||||
|
if ( t < 1 || t > 4 ) {
|
||||||
|
t = 1;
|
||||||
|
sc.setSpiderThreads ( t );
|
||||||
|
}
|
||||||
|
threadbox.setSelectedIndex ( t - 1 );
|
||||||
|
depthfield.setText ( Integer.toString ( sc.getMaxDepth() ) );
|
||||||
|
agentfield.setText ( sc.getUserAgent() );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method will coordinate all of the actions for the various buttons used.
|
||||||
|
*/
|
||||||
|
public void actionPerformed ( ActionEvent event ) {
|
||||||
|
String cmd = event.getActionCommand();
|
||||||
|
/* Download a new site */
|
||||||
|
if ( cmd.equals ( "spider" ) ) {
|
||||||
|
SpiderConfig c = new SpiderConfig ( createProperties() );
|
||||||
|
Spider spider = new Spider ( c );
|
||||||
|
/* But only if we are not already downloading the site */
|
||||||
|
if ( spiders.contains ( spider ) ) {
|
||||||
|
LechLogger.warn ( "Already have an instance of a Spider at " + c.getStartLocation() );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
spiders.add ( spider );
|
||||||
|
spider.start();
|
||||||
|
}
|
||||||
|
/* Save the current configuration to a file */
|
||||||
|
else if ( cmd.equals ( "save" ) ) {
|
||||||
|
JFileChooser f = new JFileChooser ( System.getProperty ( "user.home" ) );
|
||||||
|
int r = f.showSaveDialog ( this );
|
||||||
|
if ( r != JFileChooser.APPROVE_OPTION ) return;
|
||||||
|
File outfile = f.getSelectedFile();
|
||||||
|
try {
|
||||||
|
ObjectOutputStream os = new ObjectOutputStream ( new FileOutputStream ( outfile ) );
|
||||||
|
os.writeObject ( new SpiderConfig ( createProperties() ) );
|
||||||
|
os.close();
|
||||||
|
}
|
||||||
|
catch ( Exception exception ) {
|
||||||
|
LechLogger.error ( exception.toString() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Open a saved configuration from a file */
|
||||||
|
else if ( cmd.equals ( "open" ) ) {
|
||||||
|
JFileChooser f = new JFileChooser ( System.getProperty ( "user.home" ) );
|
||||||
|
int r = f.showOpenDialog ( this );
|
||||||
|
if ( r != JFileChooser.APPROVE_OPTION ) return;
|
||||||
|
File infile = f.getSelectedFile();
|
||||||
|
if ( !infile.canRead() ) {
|
||||||
|
LechLogger.error ( "file " + f.toString() + " is not readable" );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
ObjectInputStream os = new ObjectInputStream ( new FileInputStream ( infile ) );
|
||||||
|
SpiderConfig sc = (SpiderConfig) os.readObject();
|
||||||
|
os.close();
|
||||||
|
setSpiderConfig ( sc );
|
||||||
|
}
|
||||||
|
catch ( Exception exception ) {
|
||||||
|
LechLogger.error ( exception.toString() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Just quit */
|
||||||
|
else if ( cmd.equals ( "quit" ) ) {
|
||||||
|
System.exit ( 0 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,113 @@
|
||||||
|
/**
|
||||||
|
* Troll.java: A user interface to the weblech spider download utility.
|
||||||
|
*/
|
||||||
|
package weblech.ui;
|
||||||
|
|
||||||
|
/* I like to explicitly import all of my packages to remind me to KISS */
|
||||||
|
import javax.swing.JFrame;
|
||||||
|
import javax.swing.JPanel;
|
||||||
|
import javax.swing.JTabbedPane;
|
||||||
|
import javax.swing.JTextArea;
|
||||||
|
import javax.swing.JScrollPane;
|
||||||
|
import javax.swing.JMenuBar;
|
||||||
|
import javax.swing.JMenu;
|
||||||
|
import javax.swing.JMenuItem;
|
||||||
|
import javax.swing.JCheckBoxMenuItem;
|
||||||
|
|
||||||
|
import java.awt.Dimension;
|
||||||
|
|
||||||
|
import java.awt.event.ActionListener;
|
||||||
|
import java.awt.event.ActionEvent;
|
||||||
|
|
||||||
|
public class Troll extends JFrame implements ActionListener {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This SpiderConfigPanel is a custom panel that provides many of the more
|
||||||
|
* practical features of the weblech spider. It also controls the spiders
|
||||||
|
* created by the user.
|
||||||
|
*/
|
||||||
|
private SpiderConfigPanel configpanel;
|
||||||
|
/**
|
||||||
|
* This is the area that all of the logging facilities will use. This makes debugging
|
||||||
|
* in a system like Max OS X much simpler (:^)
|
||||||
|
*/
|
||||||
|
private static JTextArea logarea;
|
||||||
|
|
||||||
|
/* This just initializes the logging text box and readies it for recording events before
|
||||||
|
* the rest of the object is even loaded.
|
||||||
|
*/
|
||||||
|
static {
|
||||||
|
logarea = new JTextArea();
|
||||||
|
LechLogger.setTextArea ( logarea );
|
||||||
|
LechLogger.setDebugLogging();
|
||||||
|
}
|
||||||
|
|
||||||
|
Troll() {
|
||||||
|
super ( "Troll" );
|
||||||
|
Dimension initialsize = new Dimension ( 400, 375 );
|
||||||
|
setSize ( initialsize);
|
||||||
|
|
||||||
|
/* Create a menubar for controlling which aspects of the log you wish to see */
|
||||||
|
JMenuBar menubar = new JMenuBar();
|
||||||
|
JMenu logmenu = new JMenu ( "Log Options" );
|
||||||
|
JCheckBoxMenuItem showdebug = new JCheckBoxMenuItem ( "Show Debug Messages", false );
|
||||||
|
showdebug.setActionCommand ( "debug" );
|
||||||
|
showdebug.addActionListener ( this );
|
||||||
|
JCheckBoxMenuItem showinfo = new JCheckBoxMenuItem ( "Show Informational Messages", true );
|
||||||
|
showinfo.setActionCommand ( "info" );
|
||||||
|
showinfo.addActionListener ( this );
|
||||||
|
JCheckBoxMenuItem showwarn = new JCheckBoxMenuItem ( "Show Warnings", true );
|
||||||
|
showwarn.setActionCommand ( "warn" );
|
||||||
|
showwarn.addActionListener ( this );
|
||||||
|
JCheckBoxMenuItem showerror = new JCheckBoxMenuItem ( "Show Errors", true );
|
||||||
|
showerror.setActionCommand ( "error" );
|
||||||
|
showerror.addActionListener ( this );
|
||||||
|
logmenu.add ( showdebug );
|
||||||
|
logmenu.add ( showinfo );
|
||||||
|
logmenu.add ( showwarn );
|
||||||
|
logmenu.add ( showerror );
|
||||||
|
menubar.add ( logmenu );
|
||||||
|
|
||||||
|
/* A simple tab interface between configuration and error checking */
|
||||||
|
configpanel = new SpiderConfigPanel();
|
||||||
|
JPanel logpanel = new JPanel();
|
||||||
|
logpanel.add ( logarea );
|
||||||
|
JScrollPane logscroller = new JScrollPane ( logpanel );
|
||||||
|
JTabbedPane tabs = new JTabbedPane();
|
||||||
|
tabs.addTab ( "Spider", configpanel );
|
||||||
|
tabs.addTab ( "Log", logscroller );
|
||||||
|
|
||||||
|
/* Configure the JFrame to a usable state */
|
||||||
|
setJMenuBar ( menubar );
|
||||||
|
getContentPane().add ( tabs );
|
||||||
|
setLocationRelativeTo ( null );
|
||||||
|
setVisible ( true );
|
||||||
|
setDefaultCloseOperation ( JFrame.EXIT_ON_CLOSE );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method basically toggles all of the logging options.
|
||||||
|
*/
|
||||||
|
public void actionPerformed ( ActionEvent event ) {
|
||||||
|
String cmd = event.getActionCommand();
|
||||||
|
if ( cmd.equals ( "debug" ) ) {
|
||||||
|
LechLogger.setDebugLogging();
|
||||||
|
}
|
||||||
|
else if ( cmd.equals ( "info" ) ) {
|
||||||
|
LechLogger.setInformationalLogging();
|
||||||
|
}
|
||||||
|
else if ( cmd.equals ( "warn" ) ) {
|
||||||
|
LechLogger.setDebugLogging();
|
||||||
|
}
|
||||||
|
else if ( cmd.equals ( "error" ) ) {
|
||||||
|
LechLogger.setErrorLogging();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new troll and go
|
||||||
|
*/
|
||||||
|
public static void main ( String[] args ) {
|
||||||
|
Troll t = new Troll();
|
||||||
|
}
|
||||||
|
}
|
Reference in New Issue