Log in

View Full Version : Parsing webpages and adding Product Microdata.


OfMonsterAndMen
January 28th, 2013, 14:37
So this SEO thing is kind of keeping me going, and I've been steadily reading about how to direct traffic from one area of the internet to the area where I want it to be. After reading that including microdata can increase click-through rates by almost a third, I was intrigued and decided to implement a parser to update the 150k+ products on my target site automatically. So I started by analyzing the source of the page and reading up on MonoDevelop, the .NET-equivalent development environment for Linux. After some time reading the source, I noticed repeating structures that could be used to my advantage for including the required elements. Tables of items are great for this because the overall layout is repeated for each item, like so:

Code:

<table>
<tr>
<td>
item specific data
</td>
<td>
another item's data
</td>
...
</tr>
</table>



This was great from a programmer's perspective, but the tables are all of variable length and include tons of markup that relates to the target site rather than to the item. I overcame this with simple layout recognition and text modification. The source of my endeavor is included below, and I would encourage everyone in this field to modify and use my code.

Code:

/// <summary>
/// Downloads each product-category listing page of the target site, cuts out the
/// second table on the page, decorates its product cells with schema.org microdata
/// and writes the result to one HTML file per category title.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="args">Event data; not used.</param>
public virtual void AddMicroDataToTableOfProducts(object sender, EventArgs args)
{
    // Base of all web pages scraped by this tool.
    string baseUri = "http://www.target-site.com/subcategory";

    // Category pages to scrape, relative to baseUri.
    string[] products = {
        "/aprons/186.html",
        "/boots/457.html",
        "/chef-hats/452.html",
        "/chef-coats-and-jackets/453.html",
        "/chef-pants/454.html",
        "/gloves-disposable/214.html",
        "/hairnets/22.html",
        "/oven-mitts-and-pot-holders/98.html",
        "/protective-wear/456.html",
        "/safety-wear/234.html",
        "/shirts/450.html",
        "/shoes-and-clogs/451.html",
        "/waitstaff-attire/458.html",
        "/bar-and-drink-mixes/11.html",
        "/coffee/12.html",
        "/dairy/14.html",
        "/juices/13.html",
        "/smoothies/15.html",
        "/soda/16.html",
        "/soy-milk/18.html",
        "/syrups/19.html",
        "/teas/17.html",
        "/water/20.html",
        "/bags/449.html",
        "/bakery-and-deli-tissue/24.html",
        "/baking-cups/283.html",
        "/bathroom-tissue/25.html",
        "/bibs/26.html",
        "/bowls-disposable/27.html",
        "/boxes-and-circles/28.html",
        "/coasters/29.html",
        "/containers-and-lids/30.html",
        "/cups-disposable/31.html",
        "/doilies/32.html",
        "/facial-tissue/33.html",
        "/filters/209.html",
        "/foils-and-plastic-wrap/35.html",
        "/food-trays/36.html",
        "/freezer-paper/37.html",
        "/gloves-disposable/214.html",
        "/green-for-disposable-items/249.html",
        "/guest-checks/39.html",
        "/hairnets/22.html",
        "/kraft-paper/40.html"
    };

    // Title for each page above (same index). It is placed in front of the table
    // and also used as the output file name.
    string[] productTitles = {
        "Cooking & Food Preparation aprons",
        "Restaurant Footwear",
        "Chef Hats",
        "Chef Coats & Jackets",
        "Chef Pants",
        "Disposable Gloves",
        "Hairnets",
        "Oven mitts & pot holders",
        "Protective wear",
        "Safety wear",
        "Shirts",
        "Shoes & Clogs",
        "Waitstaff Attire",
        "Bar and Drink mixes",
        "Coffee",
        "Dairy Products",
        "Juices",
        "Smoothies",
        "Soda",
        "Soy Milk",
        "Syrups",
        "Teas",
        "Water",
        "Bags",
        "Bakery and Deli Tissue",
        "Baking Cups",
        "Bathroom Tissue",
        "Bibs",
        "Disposable Bowls",
        "Packaging and Boxes",
        "Coasters",
        "Containers with Lids",
        "Disposable Cups",
        "Doilies",
        "Facial Tissues",
        "Filters",
        "Foils and Plastic Wrap",
        "Food Trays",
        "Freezer Paper",
        "Disposable Gloves",
        "Disposable renewable items",
        "Guest Checks",
        "Hairnets & Hats",
        "Craft Paper"
    };

    // Value of the searchsubcategory query parameter for each page above (same index).
    // NOTE(review): the "/bags/449.html" entry is paired with 429 here, while every
    // other ID equals the number in its page path -- confirm which one is intended.
    int[] productPageId = {
        186, 457, 452, 453, 454, 214, 22, 98, 456, 234, 450,
        451, 458, 11, 12, 14, 13, 15, 16, 18, 19, 17, 20, 429, 24, 283, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 209, 35, 36, 37, 214, 249, 39, 22, 40
    };

    string searchString = "?searchsubcategory=";
    string pageSize = "&pagesize=1000000";

    using (WebClient wc = new WebClient())
    {
        for (int productId = 0; productId < products.Length; productId++)
        {
            // baseUri + page path + "?searchsubcategory=" + id + "&pagesize=1000000"
            Uri pageUri = new Uri(baseUri + products[productId] + searchString + productPageId[productId] + pageSize);
            string html = wc.DownloadString(pageUri);
            if (string.IsNullOrEmpty(html))
            {
                continue;
            }

            // Keep only the second table of the page; skip pages that do not have one
            // instead of throwing or abandoning the remaining pages.
            string table;
            if (!TryExtractSecondTable(html, out table))
            {
                continue;
            }

            // Replace site-relative image and link paths with the full path so they
            // stay visible when the table is posted to another site.
            table = table.Replace("/products", "http://www.target-site.com/products");
            table = table.Replace("/_imageresize", "http://www.target-site.com/_imageresize");
            table = table.Replace("/_resources", "http://www.target-site.com/_resources");

            int tableStart = table.IndexOf("<table", StringComparison.Ordinal);
            if (tableStart < 0)
            {
                continue;
            }

            // Insert the title in front of the table, then look for the end of the
            // <table ...> tag only after the title so the title text cannot be
            // mistaken for the tag.
            string title = productTitles[productId];
            table = table.Insert(tableStart, title);
            int tableTagStart = tableStart + title.Length;

            int tableTagEnd = table.IndexOf(">", tableTagStart, StringComparison.Ordinal);
            if (tableTagEnd < 0)
            {
                continue;
            }

            // Modify the table by adding the border attribute.
            table = table.Insert(tableTagEnd, " border=\"2\"");

            // A table without rows has nothing to annotate and is not written out.
            if (table.IndexOf("<tr", tableTagStart, StringComparison.Ordinal) < 0)
            {
                continue;
            }

            table = AddProductMicrodata(table);

            // The using block closes the file even when the write fails.
            using (StreamWriter writer = new StreamWriter("/home/****/Desktop/a4r/" + title + ".html"))
            {
                writer.Write(table);
            }
        }
    }
}

/// <summary>
/// Cuts the second table element out of a page.
/// </summary>
/// <param name="html">Full page markup.</param>
/// <param name="table">
/// On success, the markup from the second table's opening tag through the first
/// closing table tag that follows it; otherwise null.
/// </param>
/// <returns>True when a second table was found, false otherwise.</returns>
private static bool TryExtractSecondTable(string html, out string table)
{
    const string OpenMarker = "<table";
    const string CloseMarker = "</table";

    table = null;

    int firstOpen = html.IndexOf(OpenMarker, StringComparison.Ordinal);
    if (firstOpen < 0)
    {
        return false;
    }

    int firstClose = html.IndexOf(CloseMarker, firstOpen, StringComparison.Ordinal);
    if (firstClose < 0)
    {
        return false;
    }

    // Use the end of the first table to find the beginning of the next table.
    int secondOpen = html.IndexOf(OpenMarker, firstClose, StringComparison.Ordinal);
    if (secondOpen < 0)
    {
        return false;
    }

    int secondClose = html.IndexOf(CloseMarker, secondOpen, StringComparison.Ordinal);
    if (secondClose < 0)
    {
        return false;
    }

    // The close marker is matched without its '>' so add the full tag length (8),
    // clamped in case the page ends before the tag is complete.
    int length = Math.Min(secondClose - secondOpen + "</table>".Length, html.Length - secondOpen);
    table = html.Substring(secondOpen, length);
    return true;
}

/// <summary>
/// Walks the table markup from front to back and adds schema.org Product microdata
/// to each product entry it recognises by its sequence of tags.
/// </summary>
/// <param name="html">Markup of the product table.</param>
/// <returns>
/// The markup with microdata inserted. Processing stops at the first entry in which
/// an expected tag is missing; insertions made up to that point are kept.
/// </returns>
private static string AddProductMicrodata(string html)
{
    const string ProductWrapperOpen = "<div itemscope itemtype=\"http://schema.org/Product\">";
    const string BidPriceOpen = "<div class=\"ProductBidPrice\">";

    int position = 0;
    while (position >= 0)
    {
        // Locate the first div of the next product entry.
        position = html.IndexOf("<div", position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }

        // Wrap the product information in an extra div so the existing markup is
        // only added to, never rewritten.
        int productStart = position;
        html = html.Insert(position, ProductWrapperOpen);

        // Mark the first link as the link to the product. The leading space keeps
        // the attribute separate from the tag name.
        position = html.IndexOf("<a", position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }
        html = html.Insert(position + 2, " itemprop=\"url\"");

        // Mark the first image as the image of the product.
        position = html.IndexOf("<img", position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }
        html = html.Insert(position + 4, " itemprop=\"image\"");

        // The entry is only processed further when a closing div and another link
        // follow the image.
        position = html.IndexOf("</div>", position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }
        position = html.IndexOf("<a href=", position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }

        // Mark the first attribute-less div of this entry as the product name.
        // Searching from the start of the entry (not the start of the document)
        // keeps the walk from jumping back to markup that was already handled.
        position = html.IndexOf("<div>", productStart, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }
        html = html.Insert(position + 4, " itemprop=\"name\"");

        // Mark the following styled div as the manufacturer.
        position = html.IndexOf("<div style=", position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }
        html = html.Insert(position + 4, " itemprop=\"manufacturer\" itemscope itemtype=\"http://schema.org/Organization\" ");

        // Wrap the content of the manufacturer div in a span carrying the name.
        position = html.IndexOf(">", position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }
        html = html.Insert(position + 1, "<span itemprop=\"name\">");

        position = html.IndexOf("</div>", position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }
        html = html.Insert(position, "</span>");

        // Balance the extra wrapper div: a closing div is added right after the
        // opening tag of the bid-price div.
        position = html.IndexOf(BidPriceOpen, position, StringComparison.Ordinal);
        if (position < 0)
        {
            break;
        }
        html = html.Insert(position + BidPriceOpen.Length, "</div>");

        // Continue with the next cell; a result of -1 ends the loop.
        position = html.IndexOf("</td>", position, StringComparison.Ordinal);
    }

    return html;
}
}
}


regards