Automazione Browser - Facebook/Linkedin Data Collector Bot

Questo articolo mostra come sia possibile, con Delphi, automatizzare un browser (nello specifico Chrome) per utilizzarlo in tutte le sue funzionalità, JavaScript compreso. Automatizzare una copia perfetta di Chrome ci permette di utilizzare il browser esattamente come se fossimo un utente reale, in quanto tutti gli input possono essere temporizzati e randomizzati per non farsi notare, se necessario.

Come esercizio di stile possiamo, ad esempio, raccogliere l'intera lista amici di un "soggetto 0" di Facebook e accumulare una grossa quantità di dati da cui poi estrapolare ciò che desideriamo. Lo stesso vale per LinkedIn.

Per farlo bisogna scaricare questo framework per Delphi qui.
Possiamo partire dal demo MiniBrowser ma, prima di tutto, il mio consiglio è di mettere i file binari del framework in un'unica cartella, la stessa in cui vengono generati gli exe. In pratica tutti gli eseguibili che creeremo, come in questo caso fbnv.exe, devono trovarsi nella stessa cartella in cui sono presenti i file di questa immagine.




Sul codice prima di tutto dobbiamo impostare alcuni record che conterranno i dati che ci interessa leggere sul web.

  // Home page URL constant, pointed at Facebook for this bot.
  // NOTE(review): presumably the page the MiniBrowser demo opens at startup; its use is not shown in this excerpt.
  MINIBROWSER_HOMEPAGE = 'https://www.facebook.com';

// One Facebook contact extracted from a page source by ParseAll.
Type FBcontacts = record
  FBlink: string;   // profile URL; used as the uniqueness key by AddToFBList
  FBName: string;   // display name read from the aria-label attribute
end;
// One LinkedIn contact extracted by ParseLinkedin and enriched by Button4Click.
Type LKDcontacts = record
  LKDlink: string;     // profile href (e.g. "/in/mariorossi/"); uniqueness key in AddToLKDList
  LKDName: string;     // name read from the title attribute
  LKDEmail: string;    // e-mail from the contact-info section, when present
  LKDTwitter: string;  // Twitter link from the contact-info section, when present
end;


Alla Form principale aggiungiamo le procedure che parsano il sorgente HTML ricevuto e aggiungono i dati estratti a una lista dei record definiti sopra.

  public
    // facebook
    procedure FacebookStart;
    procedure ParseAll (aList: Tlist<FBContacts> );
    procedure AddToFBList ( aList: Tlist<FBContacts>; aFBcontact: FBContacts );
  //  function GetFBId (aFBlink: string ): string;
    // linkedin
    procedure ParseLinkedin ;
    procedure AddToLKDList ( aLKDcontact: LKDContacts );
  end;

// Unit-level state shared by the form methods and by CallbackGetSource.
var
  MiniBrowserFrm : TMiniBrowserFrm;
  // FBList holds the contacts of the starting profile; FBlist2 is the scratch
  // list reused for each visited friend (see FacebookStart).
  FBList,FBlist2 : TList<FBcontacts>;
  LKDList : TList<LKDcontacts>;
  // Last page source delivered to CallbackGetSource.
  aString: string;
  // FbProfileDone/LKDProfileDone are set by CallbackGetSource when the source
  // did not change between two requests. AppTerminate is checked by
  // FacebookStart; where it is set to true is not shown in this excerpt.
  FbProfileDone,LKDProfileDone,AppTerminate: boolean;
  // First index processed by the FacebookStart loop; 1 also means "new start".
  MyStart : integer;
  // Output folders (with trailing backslash), initialised in FormCreate.
  dir_FBprofiles, dir_LKDprofiles: string;
implementation

{$R *.dfm}


Queste sono le liste che conterranno i dati di Facebook e LinkedIn e i relativi percorsi in cui verranno salvati.


procedure TMiniBrowserFrm.FormCreate(Sender: TObject);
var
  ExeFolder: string;
begin
  // Contact lists: FBList for the starting profile, FBList2 as per-friend
  // scratch list, LKDList for LinkedIn.
  FBList := TList<FBContacts>.create;
  FBList2 := TList<FBContacts>.create;
  LKDList := TList<LKDContacts>.create;

  // Run-state defaults: start the crawl from index 1, no stop requested.
  AppTerminate := false;
  MyStart := 1;

  // Output folders live next to the executable and are created on first run.
  ExeFolder := ExtractFilePath (Application.ExeName );

  dir_FBprofiles := ExeFolder + 'FBprofiles\';
  if not directoryExists ( dir_FBprofiles ) then
    mkdir ( dir_FBprofiles );

  dir_LKDprofiles := ExeFolder + 'LKDprofiles\';
  if not directoryExists ( dir_LKDprofiles ) then
    mkdir ( dir_LKDprofiles );
end;

procedure TMiniBrowserFrm.FormDestroy(Sender: TObject);
begin
  // Release the three contact lists allocated in FormCreate.
  LKDList.Free;
  FBList2.Free;
  FBList.Free;
end;


Nello specifico, ogni contatto Facebook/LinkedIn viene aggiunto solo se non è già presente nella lista. Il controllo lo facciamo manualmente:

// Facebook
// Appends aFBcontact to aList unless an entry with the same FBlink
// (case-sensitive comparison) is already stored.
procedure TMiniBrowserFrm.AddToFBList ( aList: Tlist<FBContacts>; aFBcontact: FBContacts );
var
  Known: FBContacts;
begin
  for Known in aList do
    if Known.FBlink = aFBcontact.FBlink then
      exit;                       // duplicate link: nothing to add

  aList.Add(aFBcontact);
end;

// Linkedin
// Appends aLKDcontact to the global LKDList unless an entry with the same
// LKDlink (case-sensitive comparison) is already stored.
procedure TMiniBrowserFrm.AddToLKDList ( aLKDcontact: LKDContacts );
var
  Known: LKDContacts;
begin
  for Known in LKDlist do
    if Known.LKDlink = aLKDcontact.LKDlink then
      exit;                       // duplicate link: nothing to add

  LKDList.Add(aLKDcontact);
end;

Ora dobbiamo prepararci a parsare un html per estrarre i dati. Nel caso di Facebook possiamo trovare i profili in due formati diversi:
        // profile.php?id=234234234&
        // mario.rossi?
e per scrollare la pagina chiamiamo JavaScript:

 chromium1.Browser.MainFrame.ExecuteJavaScript('document.body.scrollTop = document.body.scrollHeight;','about:blank',0);

mentre per Linkedin dobbiamo simulare uno scroll su uno specifico elemento: 

  chromium1.Browser.MainFrame.ExecuteJavaScript('document.getElementsByClassName("entity-all")[0].scrollTop += 2000;','about:blank',0);






// Scrolls the currently loaded Facebook friends page and collects every
// friend link/name pair found in the page source into aList (duplicates are
// rejected by AddToFBList).
//
// Each pass: request the page source (delivered asynchronously to
// CallbackGetSource, which stores it in the global aString), wait, scan
// aString, then scroll to the bottom so that more friends get loaded.
// The procedure returns when CallbackGetSource reports an unchanged source
// (FbProfileDone) or when the friends marker is missing from the page.
//
// Links come in two shapes:
//   https://www.facebook.com/profile.php?id=234234234&...   -> cut at '&'
//   https://www.facebook.com/mario.rossi?...                -> cut at '?'
// In both cases the cut is bounded by the closing quote of the href, so a
// link without query string no longer swallows the HTML that follows it.
procedure TMiniBrowserFrm.ParseAll (aList: Tlist<FBContacts> );
const
  FRIENDS_MARKER = 'requireLazy(["ix"]';
  HREF_MARKER    = 'lfloat _ohe" href="https://www.facebook.com/';
  HREF_PREFIX    = 'lfloat _ohe" href="';
  NAME_MARKER    = 'alt="" aria-label="';
  PROFILE_PHP    = 'profile.php';
var
  Cursor, LinkStart, LinkEnd, HrefEnd, NameStart, NameEnd: integer;
  aFBcontact: FBcontacts;
begin
  while true do begin

    chromium1.Browser.MainFrame.GetSourceProc(CallbackGetSource);
    // This flag reflects the callback of the previous pass: the request just
    // issued has not been answered yet at this point.
    if FbProfileDone then exit;
    application.ProcessMessages ;
    makedelay(1000);
    application.ProcessMessages ;

    memo1.Lines.Add('Received bytes: ' + IntToStr (Length(aString)) );
    if pos ( FRIENDS_MARKER, aString, 1) = 0 then exit;   // no facebook friends

    Cursor := 1;
    while true do begin

      LinkStart := pos ( HREF_MARKER, aString, Cursor);
      if LinkStart = 0 then break;                        // page fully scanned
      LinkStart := LinkStart + length(HREF_PREFIX);

      HrefEnd := pos ( '"', aString, LinkStart);
      if HrefEnd = 0 then break;                          // truncated source

      // End of the link: '?' (or '&' for profile.php), never past the quote.
      LinkEnd := pos ( '?', aString, LinkStart);
      if (LinkEnd = 0) or (LinkEnd > HrefEnd) then
        LinkEnd := HrefEnd
      else if MidStr(aString, LinkEnd - length(PROFILE_PHP), length(PROFILE_PHP)) = PROFILE_PHP then
      begin
        LinkEnd := pos ( '&', aString, LinkEnd);
        if (LinkEnd = 0) or (LinkEnd > HrefEnd) then
          LinkEnd := HrefEnd;
      end;

      // LinkEnd is always beyond the marker just matched, so the scan
      // always moves forward.
      Cursor := LinkEnd;

      aFBcontact.FBlink := MidStr (aString, LinkStart , LinkEnd - LinkStart );
      if pos ('/photos' ,aFBcontact.FBlink,1) > 0 then continue;   // not a profile

      // The name sits a little further on, in the aria-label attribute.
      NameStart := pos (NAME_MARKER, aString, LinkEnd );
      if NameStart = 0 then continue;
      NameStart := NameStart + length(NAME_MARKER);

      NameEnd := pos ( '"', aString, NameStart);
      if NameEnd = 0 then break;                          // truncated source
      Cursor := NameEnd;

      if NameEnd > NameStart then begin
        aFBcontact.FBname := MidStr (aString, NameStart , NameEnd - NameStart );
        AddToFBList ( aList, aFBcontact );
      end;
    end;

    // Scroll to the bottom so Facebook loads the next batch of friends.
    chromium1.Browser.MainFrame.ExecuteJavaScript('document.body.scrollTop = document.body.scrollHeight;','about:blank',0);
    makedelay(1000);
    application.ProcessMessages ;

  end;
end;


// Scrolls the LinkedIn connections list of the currently loaded page and
// stores every profile link/name pair in LKDList (duplicates are rejected by
// AddToLKDList).
//
// Each pass: request the page source (delivered asynchronously to
// CallbackGetSource, which stores it in the global aString), wait, scan
// aString, then scroll the "entity-all" element to load more entries.
// The procedure returns when CallbackGetSource reports an unchanged source
// (LKDProfileDone), when the profile list marker is missing, or when an
// entry has a link but no usable title.
procedure TMiniBrowserFrm.ParseLinkedin ;
const
  LIST_MARKER  = 'class="pv-profile-detail__content ph4 ember-view">';
  ITEM_MARKER  = '<li class=" entity-list-item">';
  LINK_MARKER  = '<a data-control-name="profile_link" href="';
  TITLE_MARKER = 'title="';
var
  Cursor, ValueStart, ValueEnd: integer;
  aLKDcontact: LKDcontacts;
begin
  while true do begin

    chromium1.Browser.MainFrame.GetSourceProc(CallbackGetSource);
    // This flag reflects the callback of the previous pass: the request just
    // issued has not been answered yet at this point.
    if LkdProfileDone then exit;
    application.ProcessMessages ;
    makedelay(1000);
    application.ProcessMessages ;

    // The source does not change while it is being scanned, so the list
    // marker is checked once per pass instead of once per entry.
    if pos (LIST_MARKER, aString, 1) = 0 then begin
      ShowMessage ( 'Profile List not found!');
      exit;
    end;

    Cursor := 1;
    while true do begin

      ValueStart := pos ( ITEM_MARKER, aString, Cursor);
      if ValueStart = 0 then break;                 // page fully scanned

      // <a data-control-name="profile_link" href="/in/mariorossi/" id="ember7134" c
      ValueStart := pos (LINK_MARKER, aString, ValueStart);
      if ValueStart = 0 then break;
      ValueStart := ValueStart + Length (LINK_MARKER);

      ValueEnd := pos ( '"', aString, ValueStart);
      if ValueEnd = 0 then break;                   // truncated source
      Cursor := ValueEnd;
      if ValueEnd = ValueStart then continue;       // empty href: skip entry

      aLKDcontact.LKDlink := MidStr (aString, ValueStart , ValueEnd - ValueStart );

      // Link found; the name follows in the title attribute:
      //   nd loaded" title="Mario Rossi" alt="Foto di Mario ...
      // As in the original code, a missing or empty title aborts the parse.
      ValueStart := pos (TITLE_MARKER, aString, ValueEnd);
      if ValueStart = 0 then exit;
      ValueStart := ValueStart + Length (TITLE_MARKER);

      ValueEnd := pos ( '"', aString, ValueStart);
      if ValueEnd <= ValueStart then exit;

      aLKDcontact.LKDName := MidStr (aString, ValueStart , ValueEnd - ValueStart );
      Cursor := ValueEnd;

      // E-mail and Twitter are collected later, profile by profile.
      AddToLKDList ( aLKDcontact );
    end;

    // Scroll the list container so LinkedIn loads the next batch of entries.
    chromium1.Browser.MainFrame.ExecuteJavaScript('document.getElementsByClassName("entity-all")[0].scrollTop += 2000;','about:blank',0);

    makedelay(2000);
    application.ProcessMessages ;

  end;
end;


Entrambe le procedure chiamano   chromium1.Browser.MainFrame.GetSourceProc(CallbackGetSource);
Questo è il cuore del progetto: qui si riceve l'HTML. Opzionalmente lo salviamo su disco.


// Receives the page source requested through GetSourceProc.
//
// A source different from the one already held in the global aString
// replaces it. An identical source means the last scroll loaded nothing new,
// so both "profile done" flags are raised to stop the parsing loops.
// In either case the current aString is dumped, as raw Char data, to
// src.txt inside dir_FBprofiles.
procedure CallbackGetSource(const src: ustring);
var
  Dump: TMemoryStream;
begin
  if src <> aString then
    aString := src
  else begin
    FbProfileDone := true;
    LKDProfileDone := true;
  end;

  Dump := TMemoryStream.Create;
  try
    // Indexing [1] on an empty string is invalid, so an empty source is
    // saved as an empty file without touching the string data.
    if aString <> '' then
      Dump.Write(aString[1], Length(aString) * SizeOf(Char));
    Dump.SaveToFile(dir_FBprofiles + 'src.txt');
  finally
    Dump.Free;   // released even when SaveToFile raises
  end;
end;

La partenza e la gestione del loop principale:

// Main Facebook loop.
//
// On a new start (MyStart = 1) the friends of the profile currently shown in
// the browser are parsed into FBList and saved to 0.txt and to
// "<name of first entry>.txt" in dir_FBprofiles. Then, for every entry of
// FBList from index MyStart on, the friends page of that entry is loaded,
// parsed into FBList2 and saved to "<entry name>.txt".
procedure TMiniBrowserFrm.FacebookStart;
var
  ProfileIdx: integer;

  // Fills memo1 with aHeader followed by the entries 1..Count-1 of aSource,
  // leaving out repetitions of entry 0 and any '/photos' link.
  procedure ListToMemo (const aHeader: FBcontacts; aSource: TList<FBcontacts>);
  var
    k: integer;
  begin
    memo1.Lines.Clear ;
    memo1.Lines.Add(aHeader.FBlink + ',' + aHeader.FBname);   // first line is always the subject
    for k := 1 to aSource.Count -1 do begin
      if aSource[k].FBlink = aSource[0].FBlink then
        continue;
      if pos ('/photos' ,aSource[k].FBlink,1) = 0 then
        memo1.Lines.Add(aSource[k].FBlink + ',' + aSource[k].FBname);
    end;
  end;

begin
  aString := '';
  FbProfileDone := false;

  // New start: collect the friends of the page currently displayed.
  if MyStart = 1 then begin
    fblist.Clear ;
    ParseAll (fblist);
  end;

  if FBlist.Count = 0 then
    exit;

  // New start: persist the list of the starting profile.
  if MyStart = 1 then begin
    ListToMemo (FBlist[0], FBlist);
    memo1.Lines.SaveToFile(dir_FBprofiles + '0.txt');
    memo1.Lines.SaveToFile(dir_FBprofiles + FBlist[0].FBname + '.txt'); // for other uses
  end;

  // Index 0 is the starting profile itself, so the visit begins at MyStart.
  for ProfileIdx := MyStart to FBlist.Count -1 do begin
    // Pass through a neutral page before opening the next friends list.
    urlcbx.Text := 'www.google.com';
    chromium1.LoadURL(urlcbx.Text);
    application.ProcessMessages ;
    memo1.Lines.Clear;
    makedelay(3000);

    // The friends page URL depends on the shape of the profile link.
    if pos('profile.php',fblist[ProfileIdx].FBlink ,1) > 0 then
      urlcbx.Text := fblist[ProfileIdx].FBlink + '&sk=friends&source_ref=pb_friends_tl'
    else
      urlcbx.Text := fblist[ProfileIdx].FBlink + '/friends';

    chromium1.LoadURL(urlcbx.Text);
    application.ProcessMessages ;
    makedelay(3000);
    chromium1.Browser.MainFrame.ExecuteJavaScript('document.body.scrollTop = 0;','about:blank',0);
    application.ProcessMessages ;
    makedelay(2000);

    if AppTerminate then
      exit;

    // Fresh parse of this friend's list into the scratch list.
    fblist2.clear;
    aString := '';
    FbProfileDone := false;
    parseall(fblist2);

    ListToMemo (FBlist[ProfileIdx], FBlist2);
    memo1.Lines.SaveToFile(dir_FBprofiles + FBlist[ProfileIdx].FBname + '.txt'); // for other uses
  end;
end;





// Loads the "link,name" lines of 0.txt (in dir_FBprofiles) into FBList and
// shows the list in StringGrid1 (column 0 = name, column 1 = link), then
// makes Panel2 visible.
// Lines without a link before the first comma (blank or malformed lines)
// are skipped instead of producing a contact with an empty link.
procedure TMiniBrowserFrm.Button1Click(Sender: TObject);
var
  MyFile: TextFile;
  Row, CommaPos: integer;
  aFBContact: FBContacts;
  Line: string;
begin
  if Not FileExists (dir_FBprofiles + '0.txt') then begin
    ShowMessage ( 'File 0.txt not found!');
    exit;
  end;

  AssignFile(myFile,dir_FBprofiles + '0.txt');
  reset(Myfile);
  try
    while not Eof(myFile) do begin
      ReadLn(myFile, Line);
      // The link never contains a comma, so the first one is the separator.
      CommaPos := pos ( ',', Line,1);
      if CommaPos <= 1 then
        continue;
      aFBContact.FBlink := leftstr ( Line, CommaPos -1);
      aFBContact.FBName := Rightstr ( Line, length(Line) - CommaPos );
      AddToFBList ( fblist, aFBContact);
    end;
  finally
    CloseFile(myFile);   // the handle is released even if reading fails
  end;

  Stringgrid1.RowCount := fblist.Count ;
  StringGrid1.ColWidths [0]:= 160;
  StringGrid1.ColWidths [1]:= 260;

  for Row := 0 to fblist.Count -1 do begin
    stringgrid1.Cells [0,Row]:= fblist[Row].FBName ;
    stringgrid1.Cells [1,Row]:= fblist[Row].FBLink ;
  end;

  Panel2.Visible := true;
end;




// Parses the LinkedIn connections of the page currently displayed and saves
// them, one "link,name" line per contact, to LKD0.txt in dir_LKDprofiles.
procedure TMiniBrowserFrm.Button3Click(Sender: TObject);
var
  Contact: LKDContacts;
begin
  application.ProcessMessages ;

  // Start from a clean state, then let ParseLinkedin fill LKDList.
  LKDProfileDone := false;
  LKDList.Clear ;
  ParseLinkeDin;

  memo1.Lines.Clear ;
  for Contact in LKDlist do
    memo1.Lines.Add(Contact.LKDlink + ',' + Contact.LKDName);
  memo1.Lines.SaveToFile(dir_LKDprofiles + 'LKD0.txt');

  // E-mail addresses are looked up in a later step: each profile of LKDList
  // is visited and the "see more" contact link is clicked, e.g.
  // chromium1.Browser.MainFrame.ExecuteJavaScript('document.getElementsByClassName(''contact-see-more-less link-without-visited-state'')[0].click();','about:blank',0);
end;





// Second LinkedIn step: reloads the contacts saved in LKD0.txt, visits each
// profile, expands its contact-info section and extracts e-mail and Twitter
// link when present. One "link,name,email,twitter" line per contact is
// written to LDK0withEmail.txt in dir_LKDprofiles.
//
// Every contact is stored back and listed, whether or not e-mail or Twitter
// were found; a missing Twitter entry no longer discards an e-mail already
// extracted, and a malformed page no longer aborts the run before the
// output file is saved.
procedure TMiniBrowserFrm.Button4Click(Sender: TObject);
const
  EMAIL_SECTION   = '<section class="pv-contact-info__contact-type ci-email">';
  EMAIL_OPEN      = '<span class="pv-contact-info__contact-item Sans-15px-black-55%">';
  EMAIL_CLOSE     = '</span>';
  TWITTER_SECTION = 'pv-contact-info__contact-type ci-twitter';
  TWITTER_OPEN    = '<a target="_blank" href="';
  TWITTER_CLOSE   = '"';
var
  MyFile: TextFile;
  i, CommaPos: integer;
  Line, Value: string;
  aLKDContact : LKDContacts;

  // Returns the text found in aString between aOpen and aClose, searching
  // from the first occurrence of aSection; '' when any marker is missing.
  function TextAfter (const aSection, aOpen, aClose: string): string;
  var
    First, Last: integer;
  begin
    Result := '';
    First := pos (aSection, aString, 1);
    if First = 0 then exit;
    First := pos (aOpen, aString, First);
    if First = 0 then exit;
    First := First + length (aOpen);
    Last := pos (aClose, aString, First);
    if Last = 0 then exit;
    Result := MidStr (aString, First , Last - First );
  end;

begin
  if Not FileExists (dir_LKDprofiles + 'LKD0.txt') then begin
    ShowMessage ( 'File LKD0.txt not found!');
    exit;
  end;

  // Reload the link,name pairs produced by the previous step.
  LKDList.Clear ;
  AssignFile(myFile,dir_LKDprofiles + 'LKD0.txt');
  reset(Myfile);
  try
    while not Eof(myFile) do begin
      ReadLn(myFile, Line);
      CommaPos := pos ( ',', Line,1);
      if CommaPos <= 1 then
        continue;                       // blank or malformed line
      aLKDContact.LKDlink := leftstr ( Line, CommaPos -1);
      aLKDContact.LKDName := Rightstr ( Line, length(Line) - CommaPos );
      AddToLKDList (aLKDContact);
    end;
  finally
    CloseFile(myFile);
  end;

  // Visit the profiles one by one and extract the contact details.
  memo1.Lines.Clear ;
  for i := 0 to LKDList.count - 1 do begin

    urlcbx.Text:='www.linkedin.com' + LKDList[i].LKDlink  ;
    chromium1.LoadURL(urlcbx.Text);
    application.ProcessMessages ;
    makedelay(3000);
    chromium1.Browser.MainFrame.ExecuteJavaScript('document.querySelectorAll(''[data-control-name="contact_see_more"]'')[0].click();','about:blank',0);

    // The source arrives asynchronously in CallbackGetSource. Clearing
    // aString first and waiting (as ParseAll/ParseLinkedin do) avoids
    // reading the previous profile's HTML for this contact.
    aString := '';
    chromium1.Browser.MainFrame.GetSourceProc(CallbackGetSource);
    application.ProcessMessages ;
    makedelay(1000);
    application.ProcessMessages ;

    aLKDContact := LKDList[i];

    Value := TextAfter (EMAIL_SECTION, EMAIL_OPEN, EMAIL_CLOSE);
    if Value <> '' then
      aLKDContact.LKDEmail := Value;

    Value := TextAfter (TWITTER_SECTION, TWITTER_OPEN, TWITTER_CLOSE);
    if Value <> '' then
      aLKDContact.LKDTwitter := Value;

    LKDList[i] := aLKDContact;
    memo1.Lines.Add(LKDList[i].LKDlink + ',' + LKDlist[i].LKDname + ',' + LKDlist[i].LKDemail + ',' + LKDlist[i].LKDtwitter );
  end;

  // Save the result.
  memo1.Lines.SaveToFile(dir_LKDprofiles + 'LDK0withEmail.txt');
end;


// Waits for at least msecs milliseconds while keeping the message loop
// alive, so the form and the browser stay responsive. The message queue is
// processed at least once even when msecs is zero or negative.
//
// The 64-bit tick counter is used because the 32-bit GetTickCount value does
// not fit a signed longint at high uptime and wraps around, which could make
// the elapsed-time comparison fail.
procedure TMiniBrowserFrm.MakeDelay(msecs: integer);
var
  StartTicks, WaitFor: UInt64;
begin
  if msecs > 0 then
    WaitFor := UInt64(msecs)
  else
    WaitFor := 0;

  StartTicks := GetTickCount64;
  repeat
    Application.ProcessMessages;
  until (GetTickCount64 - StartTicks) >= WaitFor;
end;


È molto utile modificare l'interazione con JavaScript per scoprire con quali oggetti poter interagire:


// Handles the custom entries of the browser context menu.
// commandId selects the action; Result is always returned as False.
// Some commands only post a message to the form itself (the handlers of
// those messages are not part of this excerpt); the others run JavaScript in
// the main frame or (un)register scheme handler factories.
procedure TMiniBrowserFrm.Chromium1ContextMenuCommand(Sender: TObject;
  const browser: ICefBrowser; const frame: ICefFrame;
  const params: ICefContextMenuParams; commandId: Integer;
  eventFlags: TCefEventFlags; out Result: Boolean);
var
  TempParam : WParam;
  TempFactory: ICefSchemeHandlerFactory;
begin
  Result := False;

  case commandId of
    MINIBROWSER_CONTEXTMENU_HIDEDEVTOOLS :
      PostMessage(Handle, MINIBROWSER_HIDEDEVTOOLS, 0, 0);

    MINIBROWSER_CONTEXTMENU_SHOWDEVTOOLS :
      begin
        // Pack the click position into one WParam: X in the high word,
        // Y in the low word.
        TempParam := ((params.XCoord and $FFFF) shl 16) or (params.YCoord and $FFFF);
        PostMessage(Handle, MINIBROWSER_SHOWDEVTOOLS, TempParam, 0);
      end;

    MINIBROWSER_CONTEXTMENU_SHOWJSALERT :
      if (browser <> nil) and (browser.MainFrame <> nil) then
        browser.MainFrame.ExecuteJavaScript('alert(''JavaScript execute works!'');', 'about:blank', 0);

    // Installs a mouseover listener on the page body that passes the class
    // attribute of the hovered element to myextension.mouseover.
    // NOTE(review): myextension is presumably a JavaScript extension
    // registered elsewhere in the project - not shown here.
    MINIBROWSER_CONTEXTMENU_SETJSEVENT :
      if (browser <> nil) and (browser.MainFrame <> nil) then
        browser.MainFrame.ExecuteJavaScript(
          'document.body.addEventListener("mouseover", function(evt){'+
          //  'function getpath(n){'+
          //    'var ret = "<" + n.nodeName + ">";'+
          //    'if (n.parentNode){return getpath(n.parentNode) + ret} else '+
          //    'return ret'+
          //  '};'+
//            'myextension.mouseover(getpath(evt.target))}'+

            'function getClass(n){'+
              'var ret = evt.target.getAttribute("class");'+
              'return ret'+
            '};'+

            'myextension.mouseover(getClass(evt.target))}'+
          ')', 'about:blank', 0);

    MINIBROWSER_CONTEXTMENU_COPYHTML :
      PostMessage(Handle, MINIBROWSER_COPYHTML, 0, 0);

    MINIBROWSER_CONTEXTMENU_VISITDOM :
      PostMessage(Handle, MINIBROWSER_VISITDOM, 0, 0);

    // Appends a <style> element with A4 print rules to the document head.
    // chr(39) is the single quote used to delimit the JavaScript strings.
    MINIBROWSER_CONTEXTMENU_JSWRITEDOC :
      if (browser <> nil) and (browser.MainFrame <> nil) then
        browser.MainFrame.ExecuteJavaScript(
          'var css = ' + chr(39) + '@page {size: A4; margin: 0;} @media print {html, body {width: 210mm; height: 297mm;}}' + chr(39) + '; ' +
          'var style = document.createElement(' + chr(39) + 'style' + chr(39) + '); ' +
          'style.type = ' + chr(39) + 'text/css' + chr(39) + '; ' +
          'style.appendChild(document.createTextNode(css)); ' +
          'document.head.appendChild(style);',
          'about:blank', 0);

    MINIBROWSER_CONTEXTMENU_JSPRINTDOC :
      if (browser <> nil) and (browser.MainFrame <> nil) then
        browser.MainFrame.ExecuteJavaScript('window.print();', 'about:blank', 0);

    // Registers THelloScheme as handler factory for the 'hello' scheme on
    // the browser's request context; a failure is reported with a dialog.
    MINIBROWSER_CONTEXTMENU_REGSCHEME :
      if (browser <> nil) and
         (browser.host <> nil) and
         (browser.host.RequestContext <> nil) then
        begin
          // You can register the Scheme Handler Factory in the DPR file or later, for example in a context menu command.
          TempFactory := TCefSchemeHandlerFactoryOwn.Create(THelloScheme);
          if not(browser.host.RequestContext.RegisterSchemeHandlerFactory('hello', '', TempFactory)) then
            MessageDlg('RegisterSchemeHandlerFactory error !', mtError, [mbOk], 0);
        end;

    // Clears the scheme handler factories of the browser's request context;
    // a failure is reported with a dialog.
    MINIBROWSER_CONTEXTMENU_CLEARFACT :
      if (browser <> nil) and
         (browser.host <> nil) and
         (browser.host.RequestContext <> nil) then
        begin
          if not(browser.host.RequestContext.ClearSchemeHandlerFactories) then
            MessageDlg('ClearSchemeHandlerFactories error !', mtError, [mbOk], 0);
        end;
  end;
end;








Commenti

Post più popolari