
/// <summary>
/// Downloads the document at <paramref name="uri"/>, runs it through
/// SgmlReader (DocType "HTML") to obtain well-formed, indented XML, and
/// optionally reduces that XML to the values of the nodes selected by
/// <paramref name="xpath"/>.
/// </summary>
/// <param name="uri">
/// Address to fetch. A null or empty value falls back to
/// "http://www.XMLforASP.NET".
/// </param>
/// <param name="xpath">
/// XPath expression evaluated against the converted document, or null to
/// return the whole converted document.
/// </param>
/// <returns>
/// The indented XML when <paramref name="xpath"/> is null; otherwise the
/// Value of every selected node, each followed by a single space. If any
/// step throws, the exception's Message is returned instead of propagating.
/// </returns>
private string GetWellFormedHTML(string uri, string xpath)
{
    HttpWebResponse res = null;
    StreamReader sReader = null;
    StringWriter sw = null;
    SgmlReader reader = null;
    XmlTextWriter writer = null;
    try
    {
        if (string.IsNullOrEmpty(uri))
        {
            uri = "http://www.XMLforASP.NET";
        }

        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
        res = (HttpWebResponse)req.GetResponse();
        sReader = new StreamReader(res.GetResponseStream());

        // The whole response is buffered into a string before parsing.
        reader = new SgmlReader();
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(sReader.ReadToEnd());

        sw = new StringWriter();
        writer = new XmlTextWriter(sw);
        writer.Formatting = Formatting.Indented;

        // NOTE(review): WriteNode also advances the reader, so the Read()
        // that follows may step over the node WriteNode stopped on. The
        // original loop shape is kept as-is; confirm against the
        // XmlWriter.WriteNode documentation before changing it.
        while (reader.Read())
        {
            if (reader.NodeType != XmlNodeType.Whitespace)
            {
                writer.WriteNode(reader, true);
            }
        }

        if (xpath == null)
        {
            return sw.ToString();
        }

        // Filter out nodes from HTML
        StringBuilder sb = new StringBuilder();
        XPathDocument doc = new XPathDocument(new StringReader(sw.ToString()));
        XPathNavigator nav = doc.CreateNavigator();
        XPathNodeIterator nodes = nav.Select(xpath);
        while (nodes.MoveNext())
        {
            sb.Append(nodes.Current.Value + " ");
        }
        return sb.ToString();
    }
    catch (Exception exp)
    {
        // Callers receive the failure text as the return value; cleanup is
        // handled by the finally block below.
        return exp.Message;
    }
    finally
    {
        // Runs on success and failure alike. Each handle is null-checked
        // because a failure can occur before it has been assigned.
        if (writer != null) writer.Close();
        if (reader != null) reader.Close();
        if (sw != null) sw.Close();
        if (sReader != null) sReader.Close();
        if (res != null) res.Close();
    }
}
本文介绍了一个C#方法,用于从指定URL抓取HTML内容并将其转换为格式化的XML字符串。如果提供了XPath表达式,则该方法进一步筛选出所需节点。

161

被折叠的 条评论
为什么被折叠?



