抓取 Google ReCaptcha 时获取到难以辨认的 Bad Captcha 图像

 2023-01-01 16:11


为什么不直接把图片下载到 PictureBox 中呢?这样可以降低 CPU 和内存占用。这个方案也适用于另一种更高级的验证码服务 Solve Media(对于 Solve Media,如果你已经访问过某个图片 URL,下次再访问它时,得到的会是一张假的错误 captcha 图像)。

但现在我需要支持 ReCaptcha 验证码系统,让我的机器人能够以更快的速度自动化运行,而不是只能刷新网页并等待页面渲染。

所以我把代码贴在这里。据我了解,我只是在模拟 HTTP 请求时漏掉了某个属性:User-Agent 已经伪装成真实的 Internet Explorer 8;我认为问题出在 cookie 上——似乎某处生成了一个 cookie,而我找不到它是在哪里生成的,不过我下载那个 Javascript 文件时也拿到了一个 cookie。


这是Bad Captcha和Good Captcha的一个例子






' Fetches a ReCaptcha challenge from the Google demo page and loads the challenge
' image into the captcha form. A Work item is prepared only when the image loads.
' NOTE(review): HTTP.HTTPGET / HTTP.LoadWebImageToPictureBox are called on the name
' "HTTP" directly; the HTTP class declares them as instance members, so confirm that
' an instance named HTTP is in scope where this snippet runs.
Dim newCaptcha = New Captcha
' An empty user agent makes the HTTP helper fall back to its built-in IE8 string.
Dim myUserAgent As String = ""
Dim myReferer As String = "http://www.google.com/recaptcha/demo/"

' Step 1: load the demo page and pull the site key out of the challenge script URL.
Dim outputSite As String = HTTP.HTTPGET("http://www.google.com/recaptcha/demo/", "", "", "", myUserAgent, myReferer)
Dim recaptchaChallengeKey = GetBetween(outputSite, "http://www.google.com/recaptcha/api/challenge?k=", """")

'Google ReCaptcha Captcha
' Step 2: request the challenge script for that key.
outputSite = HTTP.HTTPGET("http://www.google.com/recaptcha/api/challenge?k=" & recaptchaChallengeKey, "", "", "", myUserAgent, myReferer)

' Alternative (disabled): turn the "var RecaptchaState = {...};" script into JSON and deserialize it.
'outputSite = outputSite.Replace("var RecaptchaState = {", "{""RecaptchaState"": {")
'outputSite = outputSite.Replace("};", "}}")
'Dim jsonDictionary As Dictionary(Of String, Object) = New JavaScriptSerializer().Deserialize(Of Dictionary(Of String, Object))(outputSite)

' Step 3: extract the challenge token from the script text.
Dim recaptchaChallenge = GetBetween(outputSite, "challenge : '", "',")
outputSite = HTTP.HTTPGET("http://www.google.com/recaptcha/api/js/recaptcha.js", "", "", "", myUserAgent, myReferer) 'This page looks useless but it seems the javascript loads this anyways, maybe this why I get bad captchas?

' Step 4: download the challenge image into the form's picture box.
If Not HTTP.LoadWebImageToPictureBox(newCaptcha.picCaptcha, "http://www.google.com/recaptcha/api/image?c=" & recaptchaChallenge, myUserAgent, myReferer) Then
    MessageBox.Show("Recaptcha Image loading failed!")
Else
    ' Only build the work item when there is an image to solve.
    Dim newWork As New Work
    newWork.CaptchaForm = newCaptcha
    newWork.AccountId = 1234 'ID of Accounts.
    newWork.CaptchaHash = "recaptcha_challenge_field=" & recaptchaChallenge
    newWork.CaptchaType = "ReCaptcha"
End If


Imports System.Collections.Generic
Imports System.Linq
Imports System.Text
Imports System.Net
Imports System.IO
''' <summary>
''' Small helper around HttpWebRequest for GET requests. Every request created by an
''' instance uses that instance's <see cref="StoredCookies"/> container, so cookies
''' received by one request are sent with the following ones.
''' </summary>
Public Class HTTP

    ' User agent used whenever a caller passes an empty user agent string.
    Private Const DefaultUserAgent As String = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"

    ' Size of the chunk buffer used when copying the image response.
    Private Const ReadBufferSize As Integer = 1024

    ' Cookie container attached to every request made through this instance.
    Public StoredCookies As New CookieContainer

    ''' <summary>
    ''' Creates a GET request with the headers and settings shared by all requests of this class.
    ''' </summary>
    ''' <param name="url">Absolute URL to request.</param>
    ''' <param name="userAgent">User-Agent header; an empty string selects the built-in default.</param>
    ''' <param name="referer">Referer header value.</param>
    Private Function CreateGetRequest(ByVal url As String, ByVal userAgent As String, ByVal referer As String) As HttpWebRequest
        Dim req As HttpWebRequest = DirectCast(WebRequest.Create(url), HttpWebRequest)

        If userAgent = "" Then
            userAgent = DefaultUserAgent
        End If
        req.UserAgent = userAgent
        req.Referer = referer
        req.AllowAutoRedirect = True
        req.ReadWriteTimeout = 5000
        req.CookieContainer = StoredCookies
        req.Headers.Set("Accept-Language", "en-us")

        req.KeepAlive = True
        req.Method = "GET"
        Return req
    End Function

    ''' <summary>
    ''' Performs a GET request and returns the response body as text.
    ''' </summary>
    ''' <param name="url">Absolute URL to request.</param>
    ''' <param name="proxyname">Proxy as "host:port", or an empty string for no proxy.</param>
    ''' <param name="proxylogin">Proxy user name; an empty string means an open proxy without credentials.</param>
    ''' <param name="proxypassword">Proxy password, used only when <paramref name="proxylogin"/> is not empty.</param>
    ''' <param name="userAgent">User-Agent header; an empty string selects the built-in default.</param>
    ''' <param name="referer">Referer header value.</param>
    ''' <returns>The response body, or an empty string when the request or the read fails.</returns>
    Public Function HTTPGET(ByVal url As String, ByVal proxyname As String, ByVal proxylogin As String, ByVal proxypassword As String, ByVal userAgent As String, ByVal referer As String) As String
        Dim req As HttpWebRequest = CreateGetRequest(url, userAgent, referer)

        If proxyname <> "" Then
            ' Split "host:port" once instead of once per component.
            Dim proxyParts As String() = proxyname.Split(":"c)
            Dim proxy As New WebProxy(proxyParts(0), CInt(proxyParts(1)))
            'if proxylogin is an empty string then don't use proxy credentials (open proxy)
            If proxylogin <> "" Then
                proxy.Credentials = New NetworkCredential(proxylogin, proxypassword)
            End If
            req.Proxy = proxy
        End If

        Dim response As String = ""
        Try
            ' Using blocks release the connection and the reader even when reading fails.
            Using resp As HttpWebResponse = DirectCast(req.GetResponse(), HttpWebResponse)
                Using stream_in As New StreamReader(resp.GetResponseStream())
                    response = stream_in.ReadToEnd()
                End Using
            End Using
        Catch ex As Exception
            ' Best effort: callers receive "" on failure, but the reason is at least traced.
            System.Diagnostics.Debug.WriteLine("HTTPGET failed for " & url & ": " & ex.Message)
        End Try
        Return response
    End Function

    ''' <summary>
    ''' Downloads an image and assigns it to a PictureBox.
    ''' </summary>
    ''' <param name="pb">PictureBox that receives the downloaded image.</param>
    ''' <param name="ImageURL">Image URL; "http://" is prepended when no http/https scheme is present.</param>
    ''' <param name="userAgent">User-Agent header; an empty string selects the built-in default.</param>
    ''' <param name="referer">Referer header value.</param>
    ''' <returns>True when the image was downloaded and assigned; False on any failure.</returns>
    Public Function LoadWebImageToPictureBox(ByVal pb As PictureBox, ByVal ImageURL As String, ByVal userAgent As String, ByVal referer As String) As Boolean
        Dim bAns As Boolean = False

        Try
            Dim sURL As String = Trim(ImageURL)

            ' Leave https:// URLs untouched; only scheme-less input gets the http:// prefix.
            Dim hasScheme As Boolean = sURL.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase) OrElse
                                       sURL.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase)
            If Not hasScheme Then sURL = "http://" & sURL

            Dim req As HttpWebRequest = CreateGetRequest(sURL, userAgent, referer)

            Using resp As WebResponse = req.GetResponse()
                Using remoteStream As Stream = resp.GetResponseStream()
                    ' Deliberately not disposed: the stream must stay open for as long as
                    ' the Image created from it is in use.
                    Dim objImage As New MemoryStream
                    Dim myBuffer(ReadBufferSize - 1) As Byte

                    ' Copy the response into memory chunk by chunk.
                    Dim bytesRead As Integer = remoteStream.Read(myBuffer, 0, myBuffer.Length)
                    Do While bytesRead > 0
                        objImage.Write(myBuffer, 0, bytesRead)
                        bytesRead = remoteStream.Read(myBuffer, 0, myBuffer.Length)
                    Loop

                    ' Rewind so decoding starts at the first byte of the image data.
                    objImage.Position = 0
                    pb.Image = Image.FromStream(objImage)
                    bAns = True
                End Using
            End Using
        Catch ex As Exception
            System.Diagnostics.Debug.WriteLine("LoadWebImageToPictureBox failed for " & ImageURL & ": " & ex.Message)
            bAns = False
        End Try

        Return bAns
    End Function
End Class

编辑:我找到问题所在了——是谷歌在客户端用 Javascript 实现的混淆加密机制。




1 个回答
  • 我遇到过同样的问题,并找到了一个解决办法。它不会给出最简单的验证码,但至少图像更容易辨认:结果是一个清晰可读的单词和一个模糊的单词。


    # Fragment of a function body (its "def" line is not part of this snippet).
    # Flow: request a challenge, ask the reload endpoint for a replacement challenge,
    # then fetch the image for the replacement and pass it to solveCaptcha.
    # NOTE(review): id, referer, reloadParams, UrlMgr, textextract and solveCaptcha are
    # defined outside this fragment; reloadParams is presumably a dict - confirm.

    # Request the challenge script; a random "cachestop" value is appended to the URL.
    data = UrlMgr("http://www.google.com/recaptcha/api/challenge?k=%s&cachestop=%.17f" % (id, random.random()), referer=referer, nocache=True).data
    # Pull the challenge token and the image server base URL out of the script text.
    challenge = re.search("challenge : '(.*?)',", data).group(1)
    server = re.search("server : '(.*?)',", data).group(1)
    # This step is essential for getting readable captchas: the "c" value obtained above
    # could already be used to retrieve an image, but that image would be barely readable.
    reloadParams["c"] = challenge
    reloadParams["k"] = id
    reloadParams["lang"] = "de"
    reloadParams["reason"] = "i"
    reloadParams["type"] = "image"
    # Call the reload endpoint with the first challenge to obtain a replacement one.
    data = UrlMgr("http://www.google.com/recaptcha/api/reload" , params=reloadParams, referer=referer, nocache=True).data
    # The replacement token is read from the Recaptcha.finish_reload('...', call in the response
    # (presumably textextract returns the text between the two markers - confirm).
    challenge = textextract(data, "Recaptcha.finish_reload('", "',")
    # Return the new challenge token together with the result of solveCaptcha for its image.
    return challenge, solveCaptcha(UrlMgr("%simage" % (server), params={"c":challenge}, referer=referer))


    2023-01-01 16:13 回答
