Jeg lavede engang dette her stykke kode til at illustrere:
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace E
{
/// <summary>
/// Downloads a document over HTTP and decodes it to a string using the
/// character set declared by the server or by the document itself.
/// </summary>
/// <remarks>
/// The charset is looked up first in the HTTP Content-Type header and then in
/// an HTML <c>&lt;meta http-equiv="Content-Type" content="..."&gt;</c> element.
/// A charset declared in the meta element overrides the one from the header.
/// ISO-8859-1 is used when neither source declares a usable charset.
/// </remarks>
public class HttpDownloadCharset
{
    /// <summary>Charset assumed when nothing usable is declared.</summary>
    private const string DefaultCharset = "ISO-8859-1";

    /// <summary>Size of the scratch buffer used for each read from the network stream.</summary>
    private const int ReadChunkSize = 8192;

    /// <summary>
    /// Upper bound on the capacity pre-allocated from the Content-Length header.
    /// The header is only a hint supplied by the server, so it must not be able
    /// to force an arbitrarily large allocation up front.
    /// </summary>
    private const int MaxInitialCapacity = 1000000;

    // Captures the value of a "charset=" parameter. Whitespace around '=' and an
    // opening quote are tolerated, and '_' is accepted so names such as
    // "Shift_JIS" are captured whole.
    private static readonly Regex encpat = new Regex("charset\\s*=\\s*[\"']?([A-Za-z0-9_-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Captures the CONTENT attribute of a <meta http-equiv="Content-Type"> element.
    // Both the HTML ending ">" and the XHTML ending "/>" are accepted.
    private static readonly Regex metaencpat = new Regex("<META\\s+HTTP-EQUIV\\s*=\\s*[\"']Content-Type[\"']\\s+CONTENT\\s*=\\s*[\"']([^\"']*)[\"']\\s*/?>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Extracts the charset name from a Content-Type value.
    /// </summary>
    /// <param name="contenttype">Content-Type value, e.g. "text/html; charset=utf-8". May be null.</param>
    /// <param name="defenc">Charset name returned when no charset parameter is present.</param>
    /// <returns>The declared charset name, or <paramref name="defenc"/>.</returns>
    private static string ParseContentType(string contenttype, string defenc)
    {
        if (contenttype == null)
        {
            return defenc;
        }
        Match m = encpat.Match(contenttype);
        if (m.Success)
        {
            return m.Groups[1].Value;
        }
        return defenc;
    }

    /// <summary>
    /// Looks for a Content-Type meta element in the document and extracts its charset.
    /// </summary>
    /// <param name="html">Document text; only ASCII markup needs to be readable.</param>
    /// <param name="defenc">
    /// Charset name returned when there is no such meta element, or when the
    /// element does not carry a charset parameter.
    /// </param>
    /// <returns>The charset declared in the document, or <paramref name="defenc"/>.</returns>
    private static string ParseMetaContentType(String html, String defenc)
    {
        Match m = metaencpat.Match(html);
        if (m.Success)
        {
            // Pass defenc through so a meta element without "charset=" does not
            // discard the charset already known from the HTTP header.
            return ParseContentType(m.Groups[1].Value, defenc);
        }
        return defenc;
    }

    /// <summary>
    /// Resolves a charset name to an <see cref="Encoding"/>.
    /// </summary>
    /// <param name="name">Charset name as declared by the server or the document.</param>
    /// <returns>The encoding, or null when the name is not recognised.</returns>
    private static Encoding TryGetEncoding(string name)
    {
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name; the caller picks a fallback.
            return null;
        }
    }

    /// <summary>
    /// Decodes the first <paramref name="count"/> bytes of <paramref name="data"/> to text.
    /// </summary>
    /// <param name="data">Buffer holding the response body; may be larger than the body.</param>
    /// <param name="count">Number of valid bytes at the start of <paramref name="data"/>.</param>
    /// <param name="headerContentType">Value of the HTTP Content-Type header; may be null or empty.</param>
    /// <returns>The decoded document text.</returns>
    private static string DecodeBody(byte[] data, int count, string headerContentType)
    {
        string headerCharset = ParseContentType(headerContentType, DefaultCharset);

        // A throw-away ASCII decode is enough to find the meta element, whose
        // markup is plain ASCII in the encodings this sniffing can handle.
        string ascii = Encoding.ASCII.GetString(data, 0, count);
        string charset = ParseMetaContentType(ascii, headerCharset);

        // Prefer the document's declaration, then the header's, then the default,
        // skipping any name the runtime does not recognise.
        Encoding enc = TryGetEncoding(charset)
            ?? TryGetEncoding(headerCharset)
            ?? Encoding.GetEncoding(DefaultCharset);

        // Decode only the bytes actually received, never the unused buffer tail.
        return enc.GetString(data, 0, count);
    }

    /// <summary>
    /// Reads a stream to its end into memory.
    /// </summary>
    /// <param name="stm">Stream to read; it is not closed by this method.</param>
    /// <param name="contentLength">Expected length in bytes, or a negative value when unknown.</param>
    /// <returns>A memory stream containing every byte read.</returns>
    private static MemoryStream ReadAll(Stream stm, long contentLength)
    {
        int capacity = 0;
        if (contentLength > 0)
        {
            capacity = (int)Math.Min(contentLength, (long)MaxInitialCapacity);
        }
        MemoryStream body = new MemoryStream(capacity);
        byte[] chunk = new byte[ReadChunkSize];
        int n;
        while ((n = stm.Read(chunk, 0, chunk.Length)) > 0)
        {
            // The memory stream grows as needed, so bodies of any size are kept whole.
            body.Write(chunk, 0, n);
        }
        return body;
    }

    /// <summary>
    /// Downloads the document at <paramref name="urlstr"/> and returns it as text.
    /// </summary>
    /// <param name="urlstr">Absolute HTTP URL of the document.</param>
    /// <returns>The document decoded with the charset it declares (see the class remarks).</returns>
    /// <exception cref="ArgumentException">The server answered with a status other than 200 OK.</exception>
    public static string Download(string urlstr)
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(urlstr);
        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        {
            if (resp.StatusCode != HttpStatusCode.OK)
            {
                throw new ArgumentException("URL " + urlstr + " returned " + resp.StatusDescription);
            }
            using (Stream stm = resp.GetResponseStream())
            using (MemoryStream body = ReadAll(stm, resp.ContentLength))
            {
                // GetBuffer avoids copying; only the first body.Length bytes are valid.
                return DecodeBody(body.GetBuffer(), (int)body.Length, resp.ContentType);
            }
        }
    }
}
/// <summary>
/// Console demo that downloads three test pages and prints their decoded text.
/// </summary>
public class Program
{
    /// <summary>
    /// Downloads each test page with <c>HttpDownloadCharset.Download</c> and
    /// writes the result to standard output.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    public static void Main(string[] args)
    {
        // Each URL must sit on a single line: a regular string literal cannot
        // contain a line break.
        // NOTE(review): "arne:81" looks like the author's own test server; point
        // these at a reachable host before running.
        Console.WriteLine(HttpDownloadCharset.Download("http://arne:81/~arne/f1.html"));
        Console.WriteLine(HttpDownloadCharset.Download("http://arne:81/~arne/f2.html"));
        Console.WriteLine(HttpDownloadCharset.Download("http://arne:81/~arne/f3.html"));
    }
}
}