skip to main |
skip to sidebar
RSS Feeds
ASP.NET, JavaScript, Oracle and SQL articles and code.
ASP.NET, JavaScript, Oracle and SQL articles and code.
1:19 PM
Posted by Michael Heliso
The purpose of this article is to describe how you can implement a generic “HTTP Data Client” (I apologize if the name sounds fussy) in C#, which allows you to query any web-based resource in an elegant manner (in short, it allows you to do web scraping). I would like to mention from the beginning that this is not “the perfect solution” and that it can surely be improved in many ways, so please feel free to do so. The entire concept is based on the HttpWebRequest class offered by .NET in the System.Net namespace.
Before I delve into the architecture and code, note that there are some extra libraries which are used and required by the “HTTP Data Client” project.
Here is the list of libraries:
My solution contains four projects:
The purpose of the HDPAdapter class is to allow integration of XML data with other data objects such as DataTable and DataSet. The IHDPAdapter interface exposes two methods which convert XML data into either a DataTable or a DataSet. Currently only the DataTable conversion method is implemented. Here is the code snippet for the interface and the class:
IHDPAdapter Code
using System.Data;
using System.Xml;
namespace HttpData.Client
{
/// <summary>
/// Contract for adapters that move XML content into data objects (DataTable, DataSet, etc).
/// HDPAdapter is the class that implements it.
/// </summary>
public interface IHDPAdapter
{
    #region Properties

    /// <summary>
    /// Gets or sets the command object used for select operations.
    /// </summary>
    IHDPCommand SelectCommand { get; set; }

    #endregion

    #region Methods

    /// <summary>
    /// Fills a data table with the content of the given xml document.
    /// </summary>
    /// <param name="table">The data table that receives the rows.</param>
    /// <param name="source">The xml document whose content is copied into the data table.</param>
    /// <param name="useNodes">
    /// True to name the columns after child nodes; false to name them after attributes.
    /// </param>
    /// <returns>Number of filled rows.</returns>
    int Fill(DataTable table, XmlDocument source, bool useNodes);

    /// <summary>
    /// (NOT IMPLEMENTED) Fills a data set with the content of the given xml document.
    /// </summary>
    /// <param name="dataset">The data set that receives the content.</param>
    /// <param name="source">The xml document whose content is copied into the data set.</param>
    /// <param name="useNodes">
    /// True to name the columns after child nodes; false to name them after attributes.
    /// </param>
    /// <returns>Number of filled rows.</returns>
    int Fill(DataSet dataset, XmlDocument source, bool useNodes);

    #endregion
}
}
using System;
using System.Xml;
using System.Xml.XPath;
using System.Data;
using System.Text;
namespace HttpData.Client
{
/// <summary>
/// Provides functionality for integration with data objects (DataTable, DataSet, etc).
/// Converts the content of an xml document into rows of a DataTable.
/// </summary>
public class HDPAdapter : IHDPAdapter
{
    #region Private Variables

    // Backing store shared by the interface-typed and the class-typed SelectCommand properties.
    private IHDPCommand _selectCommand;

    #endregion

    #region Properties

    /// <summary>
    /// Get or set the select IHDPCommand object (interface view).
    /// </summary>
    IHDPCommand IHDPAdapter.SelectCommand
    {
        get { return _selectCommand; }
        set { _selectCommand = value; }
    }

    /// <summary>
    /// Get or set the select HDPCommand object.
    /// The getter casts the stored command, so it throws InvalidCastException when a
    /// different IHDPCommand implementation was assigned through the interface property.
    /// </summary>
    public HDPCommand SelectCommand
    {
        get { return (HDPCommand)_selectCommand; }
        set { _selectCommand = value; }
    }

    /// <summary>
    /// Get or set the connection string.
    /// </summary>
    public string ConnectionString { get; set; }

    #endregion

    #region .ctor

    /// <summary>
    /// Create a new instance of HDPAdapter.
    /// </summary>
    public HDPAdapter()
    {
    }

    /// <summary>
    /// Create a new instance of HDPAdapter.
    /// </summary>
    /// <param name="connectionString">Connection string associated with HDPAdapter object.</param>
    public HDPAdapter(string connectionString)
    {
        this.ConnectionString = connectionString;
    }

    #endregion

    #region Public Methods

    /// <summary>
    /// Fill a data table with the content from a specified xml document object.
    /// Every element named like <c>table.TableName</c> is located with the XPath
    /// expression <c>//TableName</c>; each of its child elements becomes one row.
    /// </summary>
    /// <param name="table">Data table to be filled. Its TableName selects the source elements.</param>
    /// <param name="source">Xml document object of which content will fill the data table.</param>
    /// <param name="useNodes">True if nodes names should be used for columns names, otherwise attributes will be used.</param>
    /// <returns>
    /// 0 when <paramref name="table"/> or <paramref name="source"/> is null or the table has no name;
    /// otherwise the total number of rows in the table after the fill.
    /// </returns>
    public int Fill(DataTable table, XmlDocument source, bool useNodes)
    {
        if (table == null || source == null)
            return 0;
        if (table.TableName.Length == 0)
            return 0;

        XPathNavigator xpNav = source.CreateNavigator();
        if (xpNav == null)
            return table.Rows.Count;

        XPathNodeIterator tableNodes = xpNav.Select("//" + table.TableName);
        while (tableNodes.MoveNext())
        {
            XPathNodeIterator rowNodes = tableNodes.Current.SelectChildren(XPathNodeType.Element);
            while (rowNodes.MoveNext())
            {
                // Work on a clone: the iterator's Current navigator must not be repositioned.
                // The previous implementation rewound it with MoveToFirst() to re-read the
                // first row, which silently dropped the data of a single-row table.
                XPathNavigator rowNode = rowNodes.Current.Clone();

                // Pass 1: make sure every column referenced by this row exists
                // (done before NewRow so the row is created against the final schema).
                VisitCells(table, null, rowNode, useNodes);

                // Pass 2: copy the values.
                DataRow row = table.NewRow();
                VisitCells(table, row, rowNode, useNodes);
                table.Rows.Add(row);
            }
        }

        return table.Rows.Count;
    }

    /// <summary>
    /// (NOT IMPLEMENTED) Fill a data set with the content from a specified xml document object.
    /// </summary>
    /// <param name="dataset">Data set to be filled.</param>
    /// <param name="source">Xml document object of which content will fill the data table.</param>
    /// <param name="useNodes">True if nodes names should be used for columns names, otherwise attributes will be used.</param>
    /// <returns>Number of filled rows.</returns>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public int Fill(DataSet dataset, XmlDocument source, bool useNodes)
    {
        throw new NotImplementedException();
    }

    #endregion

    #region Private Methods

    /// <summary>
    /// Walks the cells of one row node (child elements or attributes, depending on
    /// <paramref name="useNodes"/>) and hands each name/value pair to ApplyCell.
    /// </summary>
    private static void VisitCells(DataTable table, DataRow row, XPathNavigator rowNode, bool useNodes)
    {
        if (useNodes)
        {
            XPathNodeIterator cells = rowNode.SelectChildren(XPathNodeType.Element);
            while (cells.MoveNext())
                ApplyCell(table, row, cells.Current.Name, cells.Current.Value);
        }
        else
        {
            // Clone so that moving onto the attributes does not disturb the caller's navigator.
            XPathNavigator attribute = rowNode.Clone();
            bool onAttribute = attribute.MoveToFirstAttribute();
            while (onAttribute)
            {
                ApplyCell(table, row, attribute.Name, attribute.Value);
                onAttribute = attribute.MoveToNextAttribute();
            }
        }
    }

    /// <summary>
    /// With a null <paramref name="row"/>, adds the column when the table does not have it yet;
    /// otherwise stores the value in the row under that column name.
    /// </summary>
    private static void ApplyCell(DataTable table, DataRow row, string name, string value)
    {
        if (row == null)
        {
            // Skipping existing names avoids DuplicateNameException for repeated
            // element names and for tables that already define the column.
            if (!table.Columns.Contains(name))
                table.Columns.Add(new DataColumn(name));
        }
        else
        {
            row[name] = value;
        }
    }

    #endregion
}
}
As the name says, this is the connection class, which manages in an abstract way how a connection behaves. The interface exposes a set of methods and properties relevant to it. There are only three methods exposed and implemented:
using System.Collections.Generic;
using System.Net;
namespace HttpData.Client
{
/// <summary>
/// Contract for managing a connection to a web source. HDPConnection is the class that implements it.
/// </summary>
public interface IHDPConnection
{
    #region Methods

    /// <summary>
    /// Opens the connection.
    /// </summary>
    void Open();

    /// <summary>
    /// Closes the connection.
    /// </summary>
    void Close();

    /// <summary>
    /// Creates a new command object associated with this connection.
    /// </summary>
    /// <returns>Command object associated with this connection.</returns>
    IHDPCommand CreateCommand();

    #endregion

    #region Properties

    /// <summary>
    /// Gets or sets the connection url.
    /// </summary>
    string ConnectionURL { get; set; }

    /// <summary>
    /// Gets or sets whether the connection should be kept open.
    /// </summary>
    bool KeepAlive { get; set; }

    /// <summary>
    /// Gets or sets whether automatic redirection is allowed.
    /// </summary>
    bool AutoRedirect { get; set; }

    /// <summary>
    /// Gets or sets the maximum number of automatic redirections.
    /// </summary>
    int MaxAutoRedirects { get; set; }

    /// <summary>
    /// Gets or sets the user agent to be used.
    /// </summary>
    string UserAgent { get; set; }

    /// <summary>
    /// Gets the state of the connection.
    /// </summary>
    HDPConnectionState ConnectionState { get; }

    /// <summary>
    /// Gets or sets the connection proxy.
    /// </summary>
    HDPProxy Proxy { get; set; }

    /// <summary>
    /// Gets or sets the cookies used by the connection.
    /// </summary>
    CookieCollection Cookies { get; set; }

    /// <summary>
    /// Gets or sets the content type.
    /// </summary>
    string ContentType { get; set; }

    /// <summary>
    /// Gets or sets the header details used in HttpWebRequest operations.
    /// </summary>
    List<HDPConnectionHeader> Headers { get; set; }

    /// <summary>
    /// Gets or sets the Http referer.
    /// </summary>
    string Referer { get; set; }

    #endregion
}
}
using System.Collections.Generic;
using System.Net;
namespace HttpData.Client
{
/// <summary>
/// Provides functionality for connection management of different web sources.
/// Holds the request settings (url, cookies, headers, proxy, ...) together with an
/// open/closed state flag and an optional cache.
/// </summary>
/// <remarks>
/// The class now declares <see cref="System.IDisposable"/> (it already exposed a public
/// Dispose method), so it can be used in a <c>using</c> statement.
/// </remarks>
public class HDPConnection : IHDPConnection, System.IDisposable
{
    #region Private Variables

    private HDPConnectionState _connectionState;
    private string _connectionURL;
    private HDPCache cache;
    private bool useCache;

    #endregion

    #region Properties

    /// <summary>
    /// Get the value which specifies if caching will be used.
    /// True only when a cache object was actually created, i.e. a non-null
    /// cache definition was passed to the constructor.
    /// </summary>
    public bool UseCahe
    {
        get { return useCache; }
    }

    /// <summary>
    /// Get HDPCache object; null when the connection was created without a cache definition.
    /// </summary>
    public HDPCache Cache
    {
        get { return cache; }
    }

    #endregion

    #region .ctor

    /// <summary>
    /// Instantiate a new HDPConnection object with an empty url and no cache.
    /// </summary>
    public HDPConnection()
        : this("", (HDPCacheDefinition)null)
    {
    }

    /// <summary>
    /// Instantiate a new HDPConnection object without a cache.
    /// </summary>
    /// <param name="connectionURL">Url of the web source.</param>
    public HDPConnection(string connectionURL)
        : this(connectionURL, null)
    {
    }

    /// <summary>
    /// Instantiate a new HDPConnection object with an empty url.
    /// </summary>
    /// <param name="cacheDefinitions">HDPCacheDefinition object used by caching mechanism.</param>
    public HDPConnection(HDPCacheDefinition cacheDefinitions)
        : this("", cacheDefinitions)
    {
    }

    /// <summary>
    /// Instantiate a new HDPConnection object. The connection starts closed, with an
    /// empty cookie collection and at most one automatic redirect.
    /// </summary>
    /// <param name="connectionURL">Url of the web source.</param>
    /// <param name="cacheDefinitions">
    /// HDPCacheDefinition object used by caching mechanism; when null no cache is created.
    /// </param>
    public HDPConnection(string connectionURL, HDPCacheDefinition cacheDefinitions)
    {
        _connectionState = HDPConnectionState.Closed;
        _connectionURL = connectionURL;
        Cookies = new CookieCollection();
        MaxAutoRedirects = 1;

        if (cacheDefinitions != null)
            cache = new HDPCache(cacheDefinitions);

        // Previously this flag was set to true even when no cache object existed,
        // which advertised caching while Cache was null.
        useCache = cache != null;
    }

    #endregion

    #region IHDPConnection Members

    #region Methods

    /// <summary>
    /// Open connection (marks the connection state as open).
    /// </summary>
    public void Open()
    {
        _connectionState = HDPConnectionState.Open;
    }

    /// <summary>
    /// Open connection using a specific url.
    /// </summary>
    /// <param name="connectionURL">Url of the web source; replaces the current connection url.</param>
    public void Open(string connectionURL)
    {
        _connectionURL = connectionURL;
        _connectionState = HDPConnectionState.Open;
    }

    /// <summary>
    /// Close connection: marks the state as closed and, when a cache exists,
    /// closes its storage connection.
    /// </summary>
    public void Close()
    {
        _connectionState = HDPConnectionState.Closed;
        if (cache != null)
            cache.CloseStorageConnection();
    }

    /// <summary>
    /// Create a new IHDPCommand object associated with this connection.
    /// </summary>
    /// <returns>IHDPCommand object associated with this connection.</returns>
    IHDPCommand IHDPConnection.CreateCommand()
    {
        return CreateCommand();
    }

    /// <summary>
    /// Create a new HDPCommand object associated with this connection.
    /// </summary>
    /// <returns>HDPCommand object associated with this connection.</returns>
    public HDPCommand CreateCommand()
    {
        return new HDPCommand { Connection = this };
    }

    #endregion

    #region Properties

    /// <summary>
    /// Get or set connection url.
    /// </summary>
    public string ConnectionURL
    {
        get { return _connectionURL; }
        set { _connectionURL = value; }
    }

    /// <summary>
    /// Get or set the value which specifies if auto redirection is allowed.
    /// </summary>
    public bool AutoRedirect { get; set; }

    /// <summary>
    /// Get or set the maximum number of auto redirections.
    /// </summary>
    public int MaxAutoRedirects { get; set; }

    /// <summary>
    /// Get or set the value which specifies if the connection should be kept open.
    /// </summary>
    public bool KeepAlive { get; set; }

    /// <summary>
    /// Get or set the value which specifies the user agent to be used.
    /// </summary>
    public string UserAgent { get; set; }

    /// <summary>
    /// Get or set the value which specifies the content type.
    /// </summary>
    public string ContentType { get; set; }

    /// <summary>
    /// Get or set the cookies used by the connection.
    /// </summary>
    public CookieCollection Cookies { get; set; }

    /// <summary>
    /// Get the value which specifies the state of the connection.
    /// </summary>
    public HDPConnectionState ConnectionState
    {
        get { return _connectionState; }
    }

    /// <summary>
    /// Get or set the value which specifies the connection proxy.
    /// </summary>
    public HDPProxy Proxy { get; set; }

    /// <summary>
    /// Get or set headers details used in HttpWebRequest operations.
    /// Not initialized by the constructors, so it is null until assigned.
    /// </summary>
    public List<HDPConnectionHeader> Headers { get; set; }

    /// <summary>
    /// Get or set Http referer.
    /// </summary>
    public string Referer { get; set; }

    #endregion

    #endregion

    #region IDisposable Members

    ///<summary>
    /// Dispose current object: closes the connection when it is still open.
    ///</summary>
    public void Dispose()
    {
        if (_connectionState == HDPConnectionState.Open)
            this.Close();

        System.GC.SuppressFinalize(this);
    }

    #endregion
}
}
This represents our engine, which provides functionality for querying web resources and processing the result (response). It offers a variety of ways to process the response content of a query, such as XPath, RegEx, XSLT, reflection, etc. I will discuss in detail only the main methods; the rest of them build on those, and I assume the comments which accompany the methods suffice to provide guidance in the right direction. But before I get to the methods, let me present the properties.
using System.Collections.Generic;
using System.IO;
using System.Xml.XPath;
namespace HttpData.Client
{
/// <summary>
/// Contract for querying different web sources and processing the returned data.
/// HDPCommand is the class that implements it.
/// </summary>
public interface IHDPCommand
{
    #region Properties

    /// <summary>
    /// Gets or sets the connection object used by the command.
    /// </summary>
    IHDPConnection Connection { get; set; }

    /// <summary>
    /// Gets the command parameters collection.
    /// </summary>
    IHDPParameterCollection Parameters { get; }

    /// <summary>
    /// Gets or sets the command type.
    /// </summary>
    HDPCommandType CommandType { get; set; }

    /// <summary>
    /// Gets or sets the command text.
    /// </summary>
    string CommandText { get; set; }

    /// <summary>
    /// Gets or sets the command timeout.
    /// </summary>
    int CommandTimeout { get; set; }

    /// <summary>
    /// Gets the response retrieved from the server.
    /// </summary>
    string Response { get; }

    /// <summary>
    /// Gets the URI of the web resource.
    /// </summary>
    string Uri { get; }

    /// <summary>
    /// Gets the absolute path of the web resource.
    /// </summary>
    string Path { get; }

    /// <summary>
    /// Gets the last error that occurred.
    /// </summary>
    string LastError { get; }

    /// <summary>
    /// Gets the content length of the response.
    /// </summary>
    long ContentLength { get; }

    #endregion

    #region Methods

    /// <summary>
    /// Gets the number of parameters.
    /// </summary>
    /// <returns>Number of parameters.</returns>
    int GetParametersCount();

    /// <summary>
    /// Creates a new IHDPParameter object.
    /// </summary>
    /// <returns>The new IHDPParameter object.</returns>
    IHDPParameter CreateParameter();

    /// <summary>
    /// Executes an expression against the web server and returns the number of results.
    /// </summary>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>Number of results determined by the expression.</returns>
    int ExecuteNonQuery(bool clearParams);

    /// <summary>
    /// Executes a query against the web server without reading the response stream.
    /// </summary>
    /// <returns>True if the command executed successfully, otherwise false.</returns>
    bool Execute();

    /// <summary>
    /// Executes a query against the web server.
    /// </summary>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>True if the command executed successfully, otherwise false.</returns>
    bool Execute(bool clearParams);

    /// <summary>
    /// Executes a query against the web server and exposes the response as a stream.
    /// </summary>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>The underlying http response stream.</returns>
    Stream ExecuteStream(bool clearParams);

    /// <summary>
    /// Closes the http response object.
    /// Usable only with the ExecuteStream method.
    /// </summary>
    void CloseResponse();

    /// <summary>
    /// Executes a query against the web server and returns an XPathNavigator for walking the query result.
    /// </summary>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>XPathNavigator object used to navigate through the query result.</returns>
    XPathNavigator ExecuteNavigator(bool clearParams);

    /// <summary>
    /// Executes a query against the web server and returns an IXPathNavigable for walking the query result.
    /// </summary>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>IXPathNavigable object used to navigate through the query result.</returns>
    IXPathNavigable ExecuteDocument(bool clearParams);

    /// <summary>
    /// Executes a query against the web server, applies an xpath expression to the query result
    /// and returns an IXPathNavigable for walking the outcome.
    /// </summary>
    /// <param name="expression">XPath expression.</param>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>IXPathNavigable object used to navigate through the query result.</returns>
    IXPathNavigable ExecuteDocument(string expression, bool clearParams);

    /// <summary>
    /// Executes a query against the web server and returns the binary query result.
    /// Used when querying binary content from the web server (e.g. PDF files).
    /// </summary>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>Byte array which contains the binary query result.</returns>
    byte[] ExecuteBinary(bool clearParams);

    /// <summary>
    /// Executes a query against the web server and returns the binary query result.
    /// Used when querying binary content from the web server (e.g. PDF files).
    /// </summary>
    /// <param name="boundaryLimit">The limit of the buffer which must be read.</param>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>Byte array which contains the binary query result.</returns>
    byte[] ExecuteBinary(int boundaryLimit, bool clearParams);

    /// <summary>
    /// Executes a query against the web server and returns a string representation of the binary query result.
    /// Used when querying binary content from the web server (e.g. PDF files).
    /// </summary>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>String which contains the representation of the binary query result.</returns>
    string ExecuteBinaryConversion(bool clearParams);

    /// <summary>
    /// Executes a query against the web server and returns a string representation of the query result.
    /// </summary>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>String which contains the representation of the query result.</returns>
    string ExecuteString(bool clearParams);

    /// <summary>
    /// Executes a query against the web server, applies an xpath expression to the query result
    /// and returns a string representation of the resulting value.
    /// </summary>
    /// <param name="expression">XPath expression.</param>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>String which contains the representation of the query result value.</returns>
    string ExecuteValue(string expression, bool clearParams);

    /// <summary>
    /// Executes a query against the web server, applies a regular expression to the query result
    /// and returns a string representation of the resulting value.
    /// </summary>
    /// <param name="expression">Regular expression.</param>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <param name="isRegEx">Signals that a regular expression is used; it must always be true.</param>
    /// <returns>String which contains the representation of the query result value.</returns>
    string ExecuteValue(string expression, bool clearParams, bool isRegEx);

    /// <summary>
    /// Executes a query against the web server, applies a regular expression to the query result
    /// and returns the outcome as a list of strings.
    /// </summary>
    /// <param name="expression">Regular expression.</param>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <param name="isRegEx">Signals that a regular expression is used; it must always be true.</param>
    /// <returns>List which contains the representation of the query result.</returns>
    List<string> ExecuteCollection(string expression, bool clearParams, bool isRegEx);

    /// <summary>
    /// Executes a query against the web server, applies an xpath expression to the query result
    /// and returns the outcome as a list of strings.
    /// </summary>
    /// <param name="expression">XPath expression.</param>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>List which contains the representation of the query result.</returns>
    List<string> ExecuteCollection(string expression, bool clearParams);

    /// <summary>
    /// Executes a query against the web server, applies a regular expression to the query result
    /// and returns the outcome as a string array.
    /// </summary>
    /// <param name="expression">Regular expression.</param>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <param name="isRegEx">Signals that a regular expression is used; it must always be true.</param>
    /// <returns>String array which contains the representation of the query result.</returns>
    string[] ExecuteArray(string expression, bool clearParams, bool isRegEx);

    /// <summary>
    /// Executes a query against the web server, applies an xpath expression to the query result
    /// and returns the outcome as a string array.
    /// </summary>
    /// <param name="expression">XPath expression.</param>
    /// <param name="clearParams">True to clear the parameters collection after the query is executed.</param>
    /// <returns>String array which contains the representation of the query result.</returns>
    string[] ExecuteArray(string expression, bool clearParams);

    #endregion
}
}
HDPCache, HDPCacheDefinition, HDPCacheObject and HDPCacheStorage are the classes which handle the cache. I will not dwell on this subject since it is not so important in this case. If you like, you can study those classes in more detail by yourself. I think the code comments will help you grasp their purpose and functionality quite fast.
The class HDPCacheDefinition is straightforward; it contains a set of public fields which define the cache behavior. Here are its members, followed by the HDPCacheObject class, which is the container for a cached key/value pair:
using System;
namespace HttpData.Client
{
/// <summary>
/// Bag of options that configure the cache.
/// </summary>
public class HDPCacheDefinition
{
    #region Public Variables

    /// <summary>
    /// Date until which the cache is valid. Defaults to one day after the object is created.
    /// </summary>
    public DateTime StorageActiveUntil = DateTime.Now + TimeSpan.FromDays(1);

    /// <summary>
    /// Size limit of the cache memory. Defaults to 0.
    /// </summary>
    public long MemorySizeLimit = 0L;

    /// <summary>
    /// Maximum number of objects which can be stored in the cache. Defaults to 10000.
    /// </summary>
    public int ObjectsNumberLimit = 10000;

    /// <summary>
    /// Whether disk storage will be used. Defaults to true.
    /// </summary>
    public bool UseStorage = true;

    /// <summary>
    /// Whether the data should be retrieved from the disk storage. Defaults to false.
    /// </summary>
    public bool RetrieveFromStorage = false;

    /// <summary>
    /// Whether the cache is persisted to disk in real time. Defaults to false.
    /// </summary>
    public bool RealtimePersistance = false;

    /// <summary>
    /// Name of the disk storage file.
    /// </summary>
    public string StorageName = "HttpDataProcessorCahe.che";

    #endregion
}
}
using System;
namespace HttpData.Client
{
/// <summary>
/// A single cache entry: a key, its value and the moment the entry was created.
/// </summary>
[Serializable]
public class HDPCacheObject
{
    #region Private Variables

    // Field names are kept as they are because the type is marked [Serializable].
    private string key;
    private object value;
    private DateTime cacheDate;

    #endregion

    #region .ctor

    /// <summary>
    /// Creates an empty cache entry stamped with the current date and time.
    /// </summary>
    public HDPCacheObject()
    {
        this.cacheDate = DateTime.Now;
    }

    /// <summary>
    /// Creates a cache entry for the given key/value pair, stamped with the current date and time.
    /// </summary>
    /// <param name="key">Key for the cache object</param>
    /// <param name="value">Value for the cache object</param>
    public HDPCacheObject(string key, object value)
        : this()
    {
        this.key = key;
        this.value = value;
    }

    #endregion

    #region Properties

    /// <summary>
    /// Gets or sets the key of the cache entry.
    /// </summary>
    public string Key
    {
        get { return this.key; }
        set { this.key = value; }
    }

    /// <summary>
    /// Gets or sets the value of the cache entry.
    /// </summary>
    public object Value
    {
        get { return this.value; }
        set { this.value = value; }
    }

    /// <summary>
    /// Gets the date the cache entry was created.
    /// </summary>
    public DateTime CacheDate
    {
        get { return this.cacheDate; }
    }

    #endregion
}
}
I will provide a couple of examples so you can figure out how things work. I consider this to be the best way to understand how the earth spins.
Let us say for example that we would like to retrieve all Florida cities from the following page: http://www.stateofflorida.com/Portal/DesktopDefault.aspx?tabid=34.
Here is the code to achieve the above mentioned task.
using System;
using System.Collections.Generic;
using HttpData.Client;
namespace CityStates
{
/// <summary>
/// Sample: lists the Florida cities found on a public web page by running an
/// XPath expression over the downloaded content.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point: configures a connection and a command, runs the query and prints each city.
    /// </summary>
    /// <param name="args">Command line arguments (not used).</param>
    static void Main(string[] args)
    {
        //A local constant takes no access modifier ("private const" is not valid inside a method).
        const string connectionUrl = "http://www.stateofflorida.com/Portal/DesktopDefault.aspx?tabid=34";

        //Create a new instance of HDPCacheDefinition object.
        HDPCacheDefinition cacheDefinition = new HDPCacheDefinition
        {
            UseStorage = false,
            StorageActiveUntil = DateTime.Now,
            ObjectsNumberLimit = 10000,
            RealtimePersistance = false,
            RetrieveFromStorage = false,
            //We will not use a disk storage
            StorageName = null
        };

        //Create a new instance of HDPConnection object.
        //Pass as parameters the initial connection URL and the cache definition object.
        HDPConnection connection = new HDPConnection(connectionUrl, cacheDefinition)
        {
            //Define the content type we would expect.
            ContentType = HDPContentType.TEXT,
            //We want to allow autoredirects
            AutoRedirect = true,
            //Do not perform more than 10 autoredirects
            MaxAutoRedirects = 10,
            //The user agent is FireFox 3
            UserAgent = HDPAgents.FIREFOX_3,
            //We do not want to use a proxy
            Proxy = null // If you want to use a proxy: Proxy = new HDPProxy("http://127.0.0.1:999/"/*This is your proxy address and its port*/, "PROXY_USER_NAME", "PROXY_PASSWORD")
        };

        //Open the connection
        connection.Open();
        try
        {
            //Create a new instance of HDPCommand object.
            //Pass as parameter the HDPConnection object.
            HDPCommand command = new HDPCommand(connection)
            {
                //Activate the memory cache for fast access on same web resource multiple times
                ActivatePool = true,
                //We will perform a GET action
                CommandType = HDPCommandType.Get,
                //Set the time out period
                CommandTimeout = 60000,
                //Use MSHTML library instead of HtmlAgilityPack (if the value is false then HtmlAgilityPack would be used)
                UseMsHtml = true
            };

            //Execute the query on the web resource. The received HTTPWebResponse content will be converted to XML and the XPath expression will be executed.
            //The method will return the list of Florida state cities.
            List<string> cities = command.ExecuteCollection("//ul/li/b//text()[normalize-space()]", true);
            foreach (string city in cities)
                Console.WriteLine(city);
        }
        finally
        {
            //Close the connection even when the query throws.
            connection.Close();
        }
    }
}
}
using System;
using System.Collections.Generic;
using HttpData.Client;
namespace CityStates
{
/// <summary>
/// Sample: signs in on LinkedIn and checks for the welcome page by following the
/// manual redirect link returned by the login page.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point: configures a connection and a command, builds the login parameters,
    /// reads the manual redirect link and prints the welcome title or a failure message.
    /// </summary>
    /// <param name="args">Command line arguments (not used).</param>
    static void Main(string[] args)
    {
        //A local constant takes no access modifier ("private const" is not valid inside a method).
        const string connectionUrl = "https://www.linkedin.com/secure/login?trk=hb_signin";

        //Create a new instance of HDPCacheDefinition object.
        HDPCacheDefinition cacheDefinition = new HDPCacheDefinition
        {
            UseStorage = false,
            StorageActiveUntil = DateTime.Now,
            ObjectsNumberLimit = 10000,
            RealtimePersistance = false,
            RetrieveFromStorage = false,
            //We will not use a disk storage
            StorageName = null
        };

        //Create a new instance of HDPConnection object.
        //Pass as parameters the initial connection URL and the cache definition object.
        HDPConnection connection = new HDPConnection(connectionUrl, cacheDefinition)
        {
            //Define the content type we would expect.
            ContentType = HDPContentType.TEXT,
            //We want to allow autoredirects
            AutoRedirect = true,
            //Do not perform more than 10 autoredirects
            MaxAutoRedirects = 10,
            //The user agent is FireFox 3
            UserAgent = HDPAgents.FIREFOX_3,
            //We do not want to use a proxy
            Proxy = null // If you want to use a proxy: Proxy = new HDPProxy("http://127.0.0.1:999/"/*This is your proxy address and its port*/, "PROXY_USER_NAME", "PROXY_PASSWORD")
        };

        //Open the connection
        connection.Open();
        try
        {
            //Create a new instance of HDPCommand object.
            //Pass as parameter the HDPConnection object.
            HDPCommand command = new HDPCommand(connection)
            {
                //Activate the memory cache for fast access on same web resource multiple times
                ActivatePool = true,
                //We will perform a GET action
                CommandType = HDPCommandType.Get,
                //Set the time out period
                CommandTimeout = 60000,
                //Use HtmlAgilityPack (if the value is true then MSHTML would be used)
                UseMsHtml = false
            };

            //Define the query parameters used in the POST action.
            //The actual parameter name used by a browser to authenticate you on Linkedin is without '@' sign.
            //Use a HTTP request analyzer and you will notice the difference.
            //This is how the actual POST body will look like: csrfToken="ajax:-3801133150663455891"&session_key="YOUR_EMAIL@gmail.com"&session_password="YOUR_PASSWORD"&session_login="Sign+In"&session_login=""&session_rikey=""
            //NOTE(review): as written, this collection is never handed to 'command' and the command type
            //is still Get, so no POST body is sent. Attaching the parameters and switching to the POST
            //command type needs HDPCommand members that are not shown here - TODO confirm and wire up.
            HDPParameterCollection parameters = new HDPParameterCollection();
            HDPParameter pToken = new HDPParameter("@csrfToken", "ajax:-3801133150663455891");
            HDPParameter pSessionKey = new HDPParameter("@session_key", "YOUR_EMAIL@gmail.com");
            HDPParameter pSessionPass = new HDPParameter("@session_password", "YOUR_PASSWORD");
            HDPParameter pSessionLogin = new HDPParameter("@session_login", "Sign+In");
            HDPParameter pSessionLogin_ = new HDPParameter("@session_login", "");
            HDPParameter pSessionRiKey = new HDPParameter("@session_rikey", "");
            parameters.Add(pToken);
            parameters.Add(pSessionKey);
            parameters.Add(pSessionPass);
            parameters.Add(pSessionLogin);
            parameters.Add(pSessionLogin_);
            parameters.Add(pSessionRiKey);

            //If everything went ok then LinkedIn will ask us to redirect (unfortunately autoredirect doesn't work in this case).
            //Get the manual redirect URL value.
            string value = command.ExecuteValue("//a[@id='manual_redirect_link']/@href", true);

            //URLs are compared ordinally; a culture-sensitive comparison is not appropriate here.
            if (string.Equals(value, "http://www.linkedin.com/home", StringComparison.Ordinal))
            {
                command.Connection.ConnectionURL = value;
                command.CommandType = HDPCommandType.Get;

                //Using the manual redirect URL, check if the opened web page contains the welcome message.
                //If it does contain the message, then we are in.
                string content = command.ExecuteString("//title[contains(.,'Welcome,')]", true);

                //Guard against a null result before looking at its length.
                if (!string.IsNullOrEmpty(content))
                    Console.WriteLine(content);
                else
                    Console.WriteLine("Login failed!");
            }
        }
        finally
        {
            //Close the connection even when a query throws.
            connection.Close();
        }
    }
}
}
In your sample project, please add the following app.config content if you are going to use MSHTML.
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<appSettings>
<add key="LogFilePath" value="..\Log\My-Log.txt"/>
<add key="HtmlTagsPath" value="HtmlTags.txt"/>
<add key="AttributesTagsPath" value="HtmlAttributes.txt"/>
</appSettings>
</configuration>
You can download the source code of the library and a sample project from here