Regex - C# regular expression for finding links in <a> tags with a specific file extension
I need a regex pattern for finding links in a string (containing HTML code) whose file names end in .gif or .png.
example string:
<a href="//site.com/folder/picture.png" target="_blank">picture.png</a>
I am looking for the href value between the " " quotes, and the text between <a> and </a>.
I want this:
href = //site.com/folder/picture.png
string = picture.png
My code so far:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Downloader
{
    /// <summary>
    /// Form that downloads the page whose address is typed into textBox1 and
    /// lists every anchor found in it inside richTextBox1.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the HTML at the entered URL and appends one entry per
        /// link found to the rich text box.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string url = textBox1.Text;
            string s = GetHtmlCode(url);
            foreach (LinkItem i in LinkFinder.Find(s))
            {
                richTextBox1.Text += Convert.ToString(i);
            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns it as a string.
        /// </summary>
        static string GetHtmlCode(string url)
        {
            using (WebClient client = new WebClient())
            {
                string htmlCode = client.DownloadString(url);
                return htmlCode;
            }
        }

        /// <summary>
        /// One anchor: the value of its href attribute and its visible text.
        /// </summary>
        public struct LinkItem
        {
            public string Href;
            public string Text;

            public override string ToString()
            {
                return Href + "\n\t" + Text + "\n\t";
            }
        }

        /// <summary>
        /// Regex-based extraction of anchors from an HTML string.
        /// </summary>
        static class LinkFinder
        {
            /// <summary>
            /// Finds every &lt;a ...&gt;...&lt;/a&gt; element in <paramref name="file"/>.
            /// </summary>
            /// <param name="file">The HTML text to scan.</param>
            /// <returns>One <see cref="LinkItem"/> per anchor, in document order.</returns>
            public static List<LinkItem> Find(string file)
            {
                List<LinkItem> list = new List<LinkItem>();

                // 1.
                // Find all matches in file.
                MatchCollection m1 = Regex.Matches(file, @"(<a.*?>.*?</a>)", RegexOptions.Singleline);

                // 2.
                // Loop over each match.
                foreach (Match m in m1)
                {
                    string value = m.Groups[1].Value;
                    LinkItem i = new LinkItem();

                    // 3.
                    // Get the href attribute (left null when the anchor has none).
                    Match m2 = Regex.Match(value, @"href=\""(.*?)\""", RegexOptions.Singleline);
                    if (m2.Success)
                    {
                        i.Href = m2.Groups[1].Value;
                    }

                    // 4.
                    // Remove inner tags from the text.
                    string t = Regex.Replace(value, @"\s*<.*?>\s*", "", RegexOptions.Singleline);
                    i.Text = t;

                    list.Add(i);
                }
                return list;
            }
        }
    }
}
I can suggest using HtmlAgilityPack for this task. Install it using the Manage NuGet Packages for Solution menu, and add the following method:
/// <summary>
/// Collects the href attribute value and the node text of every &lt;a&gt; element
/// whose text ends with a .png or .jpg extension.
/// </summary>
/// <param name="html">An HTML string, or an absolute http/https URL to download and parse.</param>
/// <returns>
/// A list of key-value pairs: the key is the href value (null when the anchor
/// has no href attribute) and the value is the anchor's inner text.
/// </returns>
private List<KeyValuePair<string, string>> GetLinksWithHtmlAgilityPack(string html)
{
    var result = new List<KeyValuePair<string, string>>();
    HtmlAgilityPack.HtmlDocument hap;
    Uri uriResult;

    // Accept https as well as http; otherwise an https address would be
    // parsed as if it were literal HTML and yield no links.
    if (Uri.TryCreate(html, UriKind.Absolute, out uriResult)
        && (uriResult.Scheme == Uri.UriSchemeHttp || uriResult.Scheme == Uri.UriSchemeHttps))
    {
        // html is a URL
        var web = new HtmlAgilityPack.HtmlWeb();
        hap = web.Load(uriResult.AbsoluteUri);
    }
    else
    {
        // html is an HTML string
        hap = new HtmlAgilityPack.HtmlDocument();
        hap.LoadHtml(html);
    }

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var nodes = hap.DocumentNode.SelectNodes("//a");
    if (nodes == null)
    {
        return result;
    }

    foreach (var node in nodes)
    {
        // Compare the suffix directly instead of calling Path.GetExtension:
        // link text is arbitrary and may contain characters that are invalid
        // in paths, which makes Path.GetExtension throw on .NET Framework.
        string text = node.InnerText.Trim();
        bool isImage = text.EndsWith(".png", StringComparison.OrdinalIgnoreCase)
                    || text.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase);
        if (isImage)
        {
            result.Add(new KeyValuePair<string, string>(node.GetAttributeValue("href", null), node.InnerText));
        }
    }
    return result;
}

Then, use it like this (I am using a dummy string, just for the demo):
var result = GetLinksWithHtmlAgilityPack("<a href=\"//site.com/folder/picture.png\" target=\"_blank\">picture.png</a><a href=\"//site.com/folder/picture.bmp\" target=\"_blank\">picture.bmp</a>");
Output:

Or, with a URL, like:
var result = GetLinksWithHtmlAgilityPack("http://www.google.com");
Comments
Post a Comment