seleniumuser.seleniumuser
1import atexit 2import os 3import random 4import sys 5import time 6from pathlib import Path 7from types import LambdaType 8from typing import Any 9from warnings import warn 10 11from bs4 import BeautifulSoup 12from selenium import webdriver 13from selenium.webdriver.chrome.options import Options as ChromeOptions 14from selenium.webdriver.chrome.service import Service as ChromeService 15from selenium.webdriver.common.by import By 16from selenium.webdriver.common.keys import Keys 17from selenium.webdriver.firefox.firefox_profile import FirefoxProfile 18from selenium.webdriver.firefox.options import Options as FirefoxOptions 19from selenium.webdriver.firefox.service import Service as FirefoxService 20from selenium.webdriver.remote.webelement import WebElement 21from selenium.webdriver.support.ui import Select 22 23from noiftimer import Timer 24from voxscribe import get_text_from_url 25from whosyouragent import get_agent 26 27 28class User: 29 """Sits on top of selenium to streamline 30 automation and scraping tasks.""" 31 32 def __init__( 33 self, 34 headless: bool = False, 35 browser_type: str = "firefox", 36 implicit_wait: int = 10, 37 page_load_timeout: int = 60, 38 open_browser: bool = True, 39 locator_method: str = "xpath", 40 randomize_user_agent: bool = True, 41 user_agent_rotation_period: int = None, 42 move_window_by: tuple[int, int] = (0, -1000), 43 download_dir: str | Path = None, 44 driver_path: str | Path = None, 45 ): 46 """ 47 :param headless: If True, browser window will not be visible. 48 49 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 50 51 :param implicit_wait: Number of seconds to look for a specified element before 52 selenium considers it missing and throws an exception. 53 54 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 55 before throwing an exception. 56 57 :param open_browser: If True, opens a browser window when a User object is created. 
58 If False, a manual call to self.open_browser() must be made. 59 60 :param locator_method: The locator type User should expect to be given. 61 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 62 Every member function with a 'locator' argument refers to a string matching 63 the current locator_method. 64 65 :param randomize_user_agent: If True, a random useragent will be used whenever 66 the browser is opened. If False, the native useragent will be used. 67 68 :param user_agent_rotation_period: If not None, the browser window will be closed 69 and reopened with a new useragent every user_agent_rotation_period number of minutes. 70 Rotation occurs on the first call to self.get() after the time period has elapsed. 71 Ignored if randomize_user_agent is False. 72 73 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 74 75 :param download_dir: The download folder to use. If None, the default folder will be used. 76 77 :param driver_path: The path to the webdriver executable selenium should use. 78 If None, the system PATH will be checked for the executable. 79 If the executable isn't found, the parent directories and the immediate child directories 80 of the current working directory will be searched. 
81 """ 82 self.headless = headless 83 browser_type = browser_type.lower() 84 if browser_type in ["firefox", "chrome"]: 85 self.browser_type = browser_type 86 else: 87 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 88 self.browser_open = False 89 self.implicit_wait = implicit_wait 90 self.page_load_timeout = page_load_timeout 91 self.rotation_timer = Timer() 92 self.randomize_user_agent = randomize_user_agent 93 self.user_agent_rotation_period = user_agent_rotation_period 94 self.locator_method = locator_method 95 self.turbo() 96 self.keys = Keys 97 self.move_window_by = move_window_by 98 self.download_dir = download_dir 99 self.driver_path = driver_path 100 if not self.driver_path: 101 self.search_for_driver() 102 if open_browser: 103 self.open_browser() 104 else: 105 self.browser = None 106 atexit.register(self.close_browser) 107 108 def __enter__(self): 109 return self 110 111 def __exit__(self, *args): 112 self.close_browser() 113 114 def configure_firefox(self) -> FirefoxService: 115 """Configure options and profile for firefox.""" 116 self.options = FirefoxOptions() 117 self.options.headless = self.headless 118 self.options.set_preference( 119 "widget.windows.window_occlusion_tracking.enabled", False 120 ) 121 self.options.set_preference("dom.webaudio.enabled", False) 122 if self.randomize_user_agent: 123 self.options.set_preference("general.useragent.override", get_agent()) 124 if self.download_dir: 125 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 126 self.profile = FirefoxProfile() 127 self.profile.set_preference("browser.download.dir", str(self.download_dir)) 128 self.profile.set_preference("browser.download.folderList", 2) 129 else: 130 self.profile = None 131 self.service = FirefoxService( 132 executable_path=str(self.driver_path), log_path=os.devnull 133 ) 134 135 def configure_chrome(self) -> ChromeService: 136 """Configure options and profile for chrome.""" 137 self.options = ChromeOptions() 138 
self.options.headless = self.headless 139 self.options.add_argument("--disable-blink-features=AutomationControlled") 140 self.options.add_argument("--mute-audio") 141 self.options.add_argument("--disable-infobars") 142 self.options.add_argument("--disable-notifications") 143 self.options.add_argument("--log-level=3") 144 if self.randomize_user_agent: 145 self.options.add_argument(f"--user-agent={get_agent()}") 146 self.options.add_experimental_option("useAutomationExtension", False) 147 if self.download_dir: 148 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 149 self.options.add_experimental_option( 150 "prefs", {"download.default_directory": str(self.download_dir)} 151 ) 152 self.service = ChromeService( 153 executable_path=str(self.driver_path), log_path=os.devnull 154 ) 155 156 def search_for_driver(self): 157 """Searches for the webdriver executable.""" 158 cwd = Path.cwd() 159 found = False 160 match self.browser_type: 161 case "firefox": 162 driver = "geckodriver.exe" 163 case "chrome": 164 driver = "chromedriver.exe" 165 # search PATH 166 env_path = os.environ["PATH"] 167 if sys.platform == "win32": 168 env_paths = env_path.split(";") 169 else: 170 env_paths = env_path.split(":") 171 driver = driver[: driver.find(".")] 172 for path in env_paths: 173 if (Path(path) / driver).exists(): 174 self.driver_path = Path(path) / driver 175 found = True 176 break 177 # check current working directory and parent folders 178 if not found: 179 while cwd != cwd.parent: 180 if (cwd / driver).exists(): 181 self.driver_path = cwd / driver 182 found = True 183 break 184 cwd = cwd.parent 185 # check top most level 186 if not found and (cwd / driver).exists(): 187 self.driver_path = cwd / driver 188 found = True 189 # check child folders (only 1 level down) 190 if not found: 191 for child in Path.cwd().iterdir(): 192 if child.is_dir() and (child / driver).exists(): 193 self.driver_path = child / driver 194 found = True 195 if not found: 196 warn(f"Could not find 
{driver}") 197 198 def set_implicit_wait(self, wait_time: int = None): 199 """Sets to default time if no arg given.""" 200 if not wait_time: 201 self.browser.implicitly_wait(self.implicit_wait) 202 else: 203 self.browser.implicitly_wait(wait_time) 204 205 def open_browser(self): 206 """Configures and opens selenium browser.""" 207 if not self.browser_open: 208 match self.browser_type: 209 case "firefox": 210 self.configure_firefox() 211 self.browser = webdriver.Firefox( 212 options=self.options, 213 service=self.service, 214 firefox_profile=self.profile, 215 ) 216 case "chrome": 217 self.configure_chrome() 218 self.browser = webdriver.Chrome( 219 options=self.options, service=self.service 220 ) 221 self.set_implicit_wait() 222 self.browser.maximize_window() 223 self.browser.set_window_position( 224 self.move_window_by[0], self.move_window_by[1] 225 ) 226 self.browser.maximize_window() 227 self.browser.set_page_load_timeout(self.page_load_timeout) 228 self.browser_open = True 229 self.tab_index = 0 230 self.rotation_timer.start() 231 else: 232 warn("Browser already open.") 233 234 def close_browser(self): 235 """Close browser window.""" 236 if self.browser_open: 237 self.browser_open = False 238 self.browser.quit() 239 240 def open_tab(self, url: str = "", switch_to_tab: bool = True): 241 """Opens new tab and, if provided, goes to url. 
242 243 New tab is inserted after currently active tab.""" 244 self.script("window.open(arguments[0]);", url) 245 if switch_to_tab: 246 self.switch_to_tab(self.tab_index + 1) 247 248 def switch_to_tab(self, tab_index: int): 249 """Switch to a tab in browser, zero indexed.""" 250 self.browser.switch_to.window(self.browser.window_handles[tab_index]) 251 self.tab_index = tab_index 252 253 def get_num_tabs(self) -> int: 254 """Returns number of tabs open.""" 255 return len(self.browser.window_handles) 256 257 def close_tab(self, tab_index: int = 1): 258 """Close specified tab and 259 switches to tab index 0.""" 260 self.switch_to_tab(tab_index) 261 self.browser.close() 262 self.switch_to_tab(0) 263 264 def get(self, url: str): 265 """Requests webpage at given url and rotates userAgent if necessary.""" 266 if not self.browser_open: 267 self.open_browser() 268 if ( 269 self.randomize_user_agent 270 and self.user_agent_rotation_period is not None 271 and self.rotation_timer.elapsed > (60 * self.user_agent_rotation_period) 272 ): 273 self.rotation_timer.stop() 274 self.close_browser() 275 self.open_browser() 276 self.browser.get(url) 277 self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})") 278 self.chill(self.arrival_wait) 279 280 def get_soup(self) -> BeautifulSoup: 281 """Returns a BeautifulSoup object 282 of the current page source.""" 283 return BeautifulSoup(self.browser.page_source, "html.parser") 284 285 def current_url(self) -> str: 286 """Returns current url of active tab.""" 287 return self.browser.current_url 288 289 def delete_cookies(self): 290 """Delete all cookies for 291 this browser instance.""" 292 self.browser.delete_all_cookies() 293 294 def turbo(self, engage: bool = True): 295 """When engaged, strings will be sent 296 to elements all at once and there will be 297 no waiting after actions. 
298 299 When disengaged, strings will be sent to elements 300 'one key at a time' with randomized amounts of 301 time between successive keys and after actions.""" 302 if engage: 303 self.after_key_wait = (0, 0) 304 self.after_field_wait = (0, 0) 305 self.after_click_wait = (0, 0) 306 self.arrival_wait = (1, 1) 307 self.one_key_at_a_time = False 308 self.turbo_engaged = True 309 else: 310 self.after_key_wait = (0.1, 0.5) 311 self.after_field_wait = (1, 2) 312 self.after_click_wait = (0.25, 1.5) 313 self.arrival_wait = (4, 10) 314 self.one_key_at_a_time = True 315 self.turbo_engaged = False 316 317 def chill(self, min_max: tuple[float, float]): 318 """Sleeps a random amount 319 between min_max[0] and min_max[1].""" 320 time.sleep(random.uniform(min_max[0], min_max[1])) 321 322 def script(self, script: str, args: Any = None) -> Any: 323 """Execute javascript code and returns result.""" 324 return self.browser.execute_script(script, args) 325 326 def remove(self, locator: str): 327 """Removes element from DOM.""" 328 self.script("arguments[0].remove();", self.find(locator)) 329 330 def get_length(self, locator: str) -> int: 331 """Returns number of child elements for a given element.""" 332 return int(self.script("return arguments[0].length;", self.find(locator))) 333 334 def find(self, locator: str) -> WebElement: 335 """Finds and returns a WebElement.""" 336 match self.locator_method: 337 case "xpath": 338 return self.browser.find_element(By.XPATH, locator) 339 case "id": 340 return self.browser.find_element(By.ID, locator) 341 case "className": 342 return self.browser.find_element(By.CLASS_NAME, locator) 343 case "name": 344 return self.browser.find_element(By.NAME, locator) 345 case "cssSelector": 346 return self.browser.find_element(By.CSS_SELECTOR, locator) 347 348 def find_children(self, locator: str) -> list[WebElement]: 349 """Returns a list of child WebElements 350 for given locator arg.""" 351 element = self.find(locator) 352 return 
element.find_elements("xpath", "./*") 353 354 def scroll(self, amount: int = None, fraction: float = None): 355 """Scroll web page. 356 :param amount: The number of lines to scroll if not None. 357 358 :param fraction: The amount between 0.0 and 1.0 359 of the page height to scroll. 360 361 If values are provided for both arguments, 362 amount will be used. 363 364 If values are provided for neither argument, 365 the entire page length will be scrolled. 366 367 Scrolls one line at a time if self.turbo is False.""" 368 if amount: 369 amount_to_scroll = amount 370 elif fraction: 371 amount_to_scroll = int( 372 fraction 373 * ( 374 int(self.script("return document.body.scrollHeight;")) 375 - int(self.script("return window.pageYOffset;")) 376 ) 377 ) 378 else: 379 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 380 if self.turbo_engaged: 381 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 382 else: 383 for _ in range(abs(amount_to_scroll)): 384 if amount_to_scroll >= 0: 385 self.script("window.scrollBy(0,1);") 386 else: 387 self.script("window.scrollBy(0,-1);") 388 self.chill(self.after_click_wait) 389 390 def scroll_into_view(self, locator: str) -> WebElement: 391 """Scrolls to a given element and returns the element.""" 392 element = self.find(locator) 393 self.script("arguments[0].scroll_into_view();", element) 394 self.chill(self.after_click_wait) 395 return element 396 397 def text(self, locator: str) -> str: 398 """Returns text of WebElement.""" 399 return self.find(locator).text 400 401 def click(self, locator: str) -> WebElement: 402 """Clicks on and returns WebElement.""" 403 element = self.find(locator) 404 element.click() 405 self.chill(self.after_click_wait) 406 return element 407 408 def clear(self, locator: str) -> WebElement: 409 """Clears content of WebElement if able 410 and then returns WebElement.""" 411 element = self.find(locator) 412 element.clear() 413 self.chill(self.after_click_wait) 414 return 
element 415 416 def switch_to_iframe(self, locator: str): 417 """Switch to an iframe from given locator.""" 418 self.browser.switch_to.frame(self.find(locator)) 419 420 def switch_to_parent_frame(self): 421 """Move up a frame level from current frame.""" 422 self.browser.switch_to.parent_frame() 423 424 def select( 425 self, locator: str, method: str, choice: str | int | tuple 426 ) -> WebElement: 427 """Select a choice from Select element. 428 Returns the Select element from the locator string, 429 not the option element that is selected. 430 431 :param method: Can be 'value' or 'index' 432 433 :param choice: The option to select. 434 435 If method is 'value', then choice should be 436 the html 'value' attribute of the desired option. 437 438 If method is 'index', choice can either be a single 439 int for the desired option or it can be a two-tuple. 440 If the tuple is provided, a random option between the 441 two indicies (inclusive) will be selected.""" 442 element = self.click(locator) 443 match method: 444 case "value": 445 Select(element).select_by_value(choice) 446 case "index": 447 if type(choice) == tuple: 448 choice = random.randint(choice[0], choice[1]) 449 Select(element).select_by_index(choice) 450 self.chill(self.after_field_wait) 451 return element 452 453 def click_elements( 454 self, locators: list[str], max_selections: int = None, min_selections: int = 1 455 ) -> WebElement: 456 """Click a random number of WebElements 457 and return the last WebElement clicked. 458 459 :param locators: A list of element locators to choose from. 460 461 :param max_selections: The maximum number of elements to click. 462 If None, the maximum will be the length of the locators list. 463 464 :param min_selections: The minimum number of elements to click. 465 466 e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) 467 will click between 1 and 3 random elements from the list. 
468 """ 469 if not max_selections: 470 max_selections = len(locators) 471 for option in random.sample( 472 locators, k=random.randint(min_selections, max_selections) 473 ): 474 element = self.click(option) 475 return element 476 477 def get_click_list( 478 self, num_options: int, max_choices: int = 1, min_choices: int = 1 479 ) -> list[str]: 480 """Similar to self.click_elements(), but for use with the self.fill_next() method. 481 482 Creates a list of length 'num_options' where every element is 'skip'. 483 484 A random number of elements in the list between 'min_choices' and 'max_choices' are 485 replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).""" 486 click_list = ["skip"] * num_options 487 selected_indexes = [] 488 for i in range(random.randint(min_choices, max_choices)): 489 index = random.randint(0, num_options - 1) 490 while index in selected_indexes: 491 index = random.randint(0, num_options - 1) 492 selected_indexes.append(index) 493 click_list[index] = self.keys.SPACE 494 return click_list 495 496 def send_keys( 497 self, 498 locator: str, 499 data: str, 500 click_first: bool = True, 501 clear_first: bool = False, 502 ) -> WebElement: 503 """Types data into element and returns the element. 504 505 :param data: The string to send to the element. 506 507 :param click_first: If True, the element is clicked on 508 before the data is sent. 
509 510 :param clear_first: If True, the current text of the element 511 is cleared before the data is sent.""" 512 element = self.click(locator) if click_first else self.find(locator) 513 if clear_first: 514 element.clear() 515 self.chill(self.after_click_wait) 516 if self.one_key_at_a_time: 517 for ch in str(data): 518 element.send_keys(ch) 519 self.chill(self.after_key_wait) 520 else: 521 element.send_keys(str(data)) 522 self.chill(self.after_field_wait) 523 return element 524 525 def fill_next( 526 self, data: list[str | tuple], start_element: WebElement = None 527 ) -> WebElement: 528 """Fills a form by tabbing from the current WebElement 529 to the next one and using the corresponding item in data. 530 Returns the last WebElement. 531 532 :param data: A list of form data. If an item is a string (except for 'skip') 533 it will be typed into the current WebElement. 534 535 An item in data can be a two-tuple of the form 536 ('downArrow', numberOfPresses:int|tuple[int, int]). 537 538 If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent 539 that many times to the WebElement. 540 541 If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random 542 number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. 543 This is typically for use with Select elements. 544 545 An item in data can also be 'skip', which will perform no action on the current 546 WebElement and will continue to the next one. 547 548 An item in data can also be 'click=n', where 'n' is an integer b/t 0 and 100, 549 representing a percent chance an element will be clicked or skipped: 550 >>> user.fill_next(["click=70"]) 551 552 has a 70% chance of being 553 >>> user.fill_next([user.keys.SPACE]) 554 555 and a 30% chance of being 556 >>> user.fill_next(["skip"]) 557 558 559 :param start_element: The WebElement to start tabbing from. 560 The currently active element will be used if start_element is None. 
561 562 Note: The function tabs to the next element before sending data, 563 so the start_element should the WebElement before the one 564 that should receive data[0]. 565 """ 566 element = ( 567 self.browser.switch_to.active_element 568 if not start_element 569 else start_element 570 ) 571 for datum in data: 572 element.send_keys(Keys.TAB) 573 element = self.browser.switch_to.active_element 574 self.chill(self.after_key_wait) 575 if type(datum) == str and datum.strip().startswith("click="): 576 chance = int(datum.split("=")[1].strip()) 577 if random.randint(0, 100) <= chance: 578 datum = Keys.SPACE 579 else: 580 datum = "skip" 581 if datum[0] == "downArrow": 582 if type(datum[1]) == tuple: 583 times = random.randint(datum[1][0], datum[1][1]) 584 else: 585 times = datum[1] 586 for _ in range(times): 587 element.send_keys(Keys.ARROW_DOWN) 588 self.chill(self.after_key_wait) 589 elif datum == "skip": 590 self.chill(self.after_key_wait) 591 else: 592 593 if self.turbo_engaged: 594 element.send_keys(str(datum)) 595 else: 596 for ch in str(datum): 597 element.send_keys(ch) 598 self.chill(self.after_key_wait) 599 self.chill(self.after_field_wait) 600 return element 601 602 def wait_until( 603 self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1 604 ): 605 """Checks condition repeatedly until either it is true, 606 or the max_wait is exceeded. 607 608 Raises a TimeoutError if the condition doesn't success within max_wait. 609 610 Useful for determing whether a form has been successfully submitted. 611 612 :param condition: The condition function to check. 613 614 :param max_wait: Number of seconds to continue checking condition 615 before throwing a TimeoutError. 616 617 :param polling_interval: The number of seconds to sleep before 618 checking the condition function again after it fails. 619 620 e.g. 
self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]))""" 621 start_time = time.time() 622 while True: 623 try: 624 if condition(): 625 time.sleep(1) 626 break 627 elif (time.time() - start_time) > max_wait: 628 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 629 else: 630 time.sleep(polling_interval) 631 except: 632 if (time.time() - start_time) > max_wait: 633 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 634 else: 635 time.sleep(polling_interval) 636 637 def dismiss_alert(self): 638 """Dismiss alert dialog.""" 639 self.browser.switch_to.alert.dismiss() 640 641 def solve_recaptcha_v3( 642 self, 643 outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]', 644 inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]', 645 ): 646 """Pass google recaptcha v3 by solving an audio puzzle. 647 648 :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. 649 If it's the recaptcha without the initial checkbox that just shows the image puzzle, 650 pass None to this argument. 651 652 """ 653 locator_method = self.locator_method 654 self.locator_method = "xpath" 655 try: 656 if outer_iframe_xpath: 657 self.switch_to_iframe(outer_iframe_xpath) 658 self.click('//*[@id="recaptcha-anchor"]') 659 self.switch_to_parent_frame() 660 self.switch_to_iframe(inner_iframe_xpath) 661 self.click('//*[@id="recaptcha-audio-button"]') 662 mp3_url = self.find( 663 '//a[@class="rc-audiochallenge-tdownload-link"]' 664 ).get_attribute("href") 665 text = get_text_from_url(mp3_url, ".mp3") 666 self.send_keys('//*[@id="audio-response"]', text) 667 self.click('//*[@id="recaptcha-verify-button"]') 668 except Exception as e: 669 print(e) 670 raise Exception("Could not solve captcha") 671 finally: 672 self.switch_to_parent_frame() 673 self.locator_method = locator_method
29class User: 30 """Sits on top of selenium to streamline 31 automation and scraping tasks.""" 32 33 def __init__( 34 self, 35 headless: bool = False, 36 browser_type: str = "firefox", 37 implicit_wait: int = 10, 38 page_load_timeout: int = 60, 39 open_browser: bool = True, 40 locator_method: str = "xpath", 41 randomize_user_agent: bool = True, 42 user_agent_rotation_period: int = None, 43 move_window_by: tuple[int, int] = (0, -1000), 44 download_dir: str | Path = None, 45 driver_path: str | Path = None, 46 ): 47 """ 48 :param headless: If True, browser window will not be visible. 49 50 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 51 52 :param implicit_wait: Number of seconds to look for a specified element before 53 selenium considers it missing and throws an exception. 54 55 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 56 before throwing an exception. 57 58 :param open_browser: If True, opens a browser window when a User object is created. 59 If False, a manual call to self.open_browser() must be made. 60 61 :param locator_method: The locator type User should expect to be given. 62 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 63 Every member function with a 'locator' argument refers to a string matching 64 the current locator_method. 65 66 :param randomize_user_agent: If True, a random useragent will be used whenever 67 the browser is opened. If False, the native useragent will be used. 68 69 :param user_agent_rotation_period: If not None, the browser window will be closed 70 and reopened with a new useragent every user_agent_rotation_period number of minutes. 71 Rotation occurs on the first call to self.get() after the time period has elapsed. 72 Ignored if randomize_user_agent is False. 73 74 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 75 76 :param download_dir: The download folder to use. 
If None, the default folder will be used. 77 78 :param driver_path: The path to the webdriver executable selenium should use. 79 If None, the system PATH will be checked for the executable. 80 If the executable isn't found, the parent directories and the immediate child directories 81 of the current working directory will be searched. 82 """ 83 self.headless = headless 84 browser_type = browser_type.lower() 85 if browser_type in ["firefox", "chrome"]: 86 self.browser_type = browser_type 87 else: 88 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 89 self.browser_open = False 90 self.implicit_wait = implicit_wait 91 self.page_load_timeout = page_load_timeout 92 self.rotation_timer = Timer() 93 self.randomize_user_agent = randomize_user_agent 94 self.user_agent_rotation_period = user_agent_rotation_period 95 self.locator_method = locator_method 96 self.turbo() 97 self.keys = Keys 98 self.move_window_by = move_window_by 99 self.download_dir = download_dir 100 self.driver_path = driver_path 101 if not self.driver_path: 102 self.search_for_driver() 103 if open_browser: 104 self.open_browser() 105 else: 106 self.browser = None 107 atexit.register(self.close_browser) 108 109 def __enter__(self): 110 return self 111 112 def __exit__(self, *args): 113 self.close_browser() 114 115 def configure_firefox(self) -> FirefoxService: 116 """Configure options and profile for firefox.""" 117 self.options = FirefoxOptions() 118 self.options.headless = self.headless 119 self.options.set_preference( 120 "widget.windows.window_occlusion_tracking.enabled", False 121 ) 122 self.options.set_preference("dom.webaudio.enabled", False) 123 if self.randomize_user_agent: 124 self.options.set_preference("general.useragent.override", get_agent()) 125 if self.download_dir: 126 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 127 self.profile = FirefoxProfile() 128 self.profile.set_preference("browser.download.dir", str(self.download_dir)) 129 
self.profile.set_preference("browser.download.folderList", 2) 130 else: 131 self.profile = None 132 self.service = FirefoxService( 133 executable_path=str(self.driver_path), log_path=os.devnull 134 ) 135 136 def configure_chrome(self) -> ChromeService: 137 """Configure options and profile for chrome.""" 138 self.options = ChromeOptions() 139 self.options.headless = self.headless 140 self.options.add_argument("--disable-blink-features=AutomationControlled") 141 self.options.add_argument("--mute-audio") 142 self.options.add_argument("--disable-infobars") 143 self.options.add_argument("--disable-notifications") 144 self.options.add_argument("--log-level=3") 145 if self.randomize_user_agent: 146 self.options.add_argument(f"--user-agent={get_agent()}") 147 self.options.add_experimental_option("useAutomationExtension", False) 148 if self.download_dir: 149 Path(self.download_dir).mkdir(parents=True, exist_ok=True) 150 self.options.add_experimental_option( 151 "prefs", {"download.default_directory": str(self.download_dir)} 152 ) 153 self.service = ChromeService( 154 executable_path=str(self.driver_path), log_path=os.devnull 155 ) 156 157 def search_for_driver(self): 158 """Searches for the webdriver executable.""" 159 cwd = Path.cwd() 160 found = False 161 match self.browser_type: 162 case "firefox": 163 driver = "geckodriver.exe" 164 case "chrome": 165 driver = "chromedriver.exe" 166 # search PATH 167 env_path = os.environ["PATH"] 168 if sys.platform == "win32": 169 env_paths = env_path.split(";") 170 else: 171 env_paths = env_path.split(":") 172 driver = driver[: driver.find(".")] 173 for path in env_paths: 174 if (Path(path) / driver).exists(): 175 self.driver_path = Path(path) / driver 176 found = True 177 break 178 # check current working directory and parent folders 179 if not found: 180 while cwd != cwd.parent: 181 if (cwd / driver).exists(): 182 self.driver_path = cwd / driver 183 found = True 184 break 185 cwd = cwd.parent 186 # check top most level 187 if not 
found and (cwd / driver).exists(): 188 self.driver_path = cwd / driver 189 found = True 190 # check child folders (only 1 level down) 191 if not found: 192 for child in Path.cwd().iterdir(): 193 if child.is_dir() and (child / driver).exists(): 194 self.driver_path = child / driver 195 found = True 196 if not found: 197 warn(f"Could not find {driver}") 198 199 def set_implicit_wait(self, wait_time: int = None): 200 """Sets to default time if no arg given.""" 201 if not wait_time: 202 self.browser.implicitly_wait(self.implicit_wait) 203 else: 204 self.browser.implicitly_wait(wait_time) 205 206 def open_browser(self): 207 """Configures and opens selenium browser.""" 208 if not self.browser_open: 209 match self.browser_type: 210 case "firefox": 211 self.configure_firefox() 212 self.browser = webdriver.Firefox( 213 options=self.options, 214 service=self.service, 215 firefox_profile=self.profile, 216 ) 217 case "chrome": 218 self.configure_chrome() 219 self.browser = webdriver.Chrome( 220 options=self.options, service=self.service 221 ) 222 self.set_implicit_wait() 223 self.browser.maximize_window() 224 self.browser.set_window_position( 225 self.move_window_by[0], self.move_window_by[1] 226 ) 227 self.browser.maximize_window() 228 self.browser.set_page_load_timeout(self.page_load_timeout) 229 self.browser_open = True 230 self.tab_index = 0 231 self.rotation_timer.start() 232 else: 233 warn("Browser already open.") 234 235 def close_browser(self): 236 """Close browser window.""" 237 if self.browser_open: 238 self.browser_open = False 239 self.browser.quit() 240 241 def open_tab(self, url: str = "", switch_to_tab: bool = True): 242 """Opens new tab and, if provided, goes to url. 
243 244 New tab is inserted after currently active tab.""" 245 self.script("window.open(arguments[0]);", url) 246 if switch_to_tab: 247 self.switch_to_tab(self.tab_index + 1) 248 249 def switch_to_tab(self, tab_index: int): 250 """Switch to a tab in browser, zero indexed.""" 251 self.browser.switch_to.window(self.browser.window_handles[tab_index]) 252 self.tab_index = tab_index 253 254 def get_num_tabs(self) -> int: 255 """Returns number of tabs open.""" 256 return len(self.browser.window_handles) 257 258 def close_tab(self, tab_index: int = 1): 259 """Close specified tab and 260 switches to tab index 0.""" 261 self.switch_to_tab(tab_index) 262 self.browser.close() 263 self.switch_to_tab(0) 264 265 def get(self, url: str): 266 """Requests webpage at given url and rotates userAgent if necessary.""" 267 if not self.browser_open: 268 self.open_browser() 269 if ( 270 self.randomize_user_agent 271 and self.user_agent_rotation_period is not None 272 and self.rotation_timer.elapsed > (60 * self.user_agent_rotation_period) 273 ): 274 self.rotation_timer.stop() 275 self.close_browser() 276 self.open_browser() 277 self.browser.get(url) 278 self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})") 279 self.chill(self.arrival_wait) 280 281 def get_soup(self) -> BeautifulSoup: 282 """Returns a BeautifulSoup object 283 of the current page source.""" 284 return BeautifulSoup(self.browser.page_source, "html.parser") 285 286 def current_url(self) -> str: 287 """Returns current url of active tab.""" 288 return self.browser.current_url 289 290 def delete_cookies(self): 291 """Delete all cookies for 292 this browser instance.""" 293 self.browser.delete_all_cookies() 294 295 def turbo(self, engage: bool = True): 296 """When engaged, strings will be sent 297 to elements all at once and there will be 298 no waiting after actions. 
299 300 When disengaged, strings will be sent to elements 301 'one key at a time' with randomized amounts of 302 time between successive keys and after actions.""" 303 if engage: 304 self.after_key_wait = (0, 0) 305 self.after_field_wait = (0, 0) 306 self.after_click_wait = (0, 0) 307 self.arrival_wait = (1, 1) 308 self.one_key_at_a_time = False 309 self.turbo_engaged = True 310 else: 311 self.after_key_wait = (0.1, 0.5) 312 self.after_field_wait = (1, 2) 313 self.after_click_wait = (0.25, 1.5) 314 self.arrival_wait = (4, 10) 315 self.one_key_at_a_time = True 316 self.turbo_engaged = False 317 318 def chill(self, min_max: tuple[float, float]): 319 """Sleeps a random amount 320 between min_max[0] and min_max[1].""" 321 time.sleep(random.uniform(min_max[0], min_max[1])) 322 323 def script(self, script: str, args: Any = None) -> Any: 324 """Execute javascript code and returns result.""" 325 return self.browser.execute_script(script, args) 326 327 def remove(self, locator: str): 328 """Removes element from DOM.""" 329 self.script("arguments[0].remove();", self.find(locator)) 330 331 def get_length(self, locator: str) -> int: 332 """Returns number of child elements for a given element.""" 333 return int(self.script("return arguments[0].length;", self.find(locator))) 334 335 def find(self, locator: str) -> WebElement: 336 """Finds and returns a WebElement.""" 337 match self.locator_method: 338 case "xpath": 339 return self.browser.find_element(By.XPATH, locator) 340 case "id": 341 return self.browser.find_element(By.ID, locator) 342 case "className": 343 return self.browser.find_element(By.CLASS_NAME, locator) 344 case "name": 345 return self.browser.find_element(By.NAME, locator) 346 case "cssSelector": 347 return self.browser.find_element(By.CSS_SELECTOR, locator) 348 349 def find_children(self, locator: str) -> list[WebElement]: 350 """Returns a list of child WebElements 351 for given locator arg.""" 352 element = self.find(locator) 353 return 
element.find_elements("xpath", "./*") 354 355 def scroll(self, amount: int = None, fraction: float = None): 356 """Scroll web page. 357 :param amount: The number of lines to scroll if not None. 358 359 :param fraction: The amount between 0.0 and 1.0 360 of the page height to scroll. 361 362 If values are provided for both arguments, 363 amount will be used. 364 365 If values are provided for neither argument, 366 the entire page length will be scrolled. 367 368 Scrolls one line at a time if self.turbo is False.""" 369 if amount: 370 amount_to_scroll = amount 371 elif fraction: 372 amount_to_scroll = int( 373 fraction 374 * ( 375 int(self.script("return document.body.scrollHeight;")) 376 - int(self.script("return window.pageYOffset;")) 377 ) 378 ) 379 else: 380 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 381 if self.turbo_engaged: 382 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 383 else: 384 for _ in range(abs(amount_to_scroll)): 385 if amount_to_scroll >= 0: 386 self.script("window.scrollBy(0,1);") 387 else: 388 self.script("window.scrollBy(0,-1);") 389 self.chill(self.after_click_wait) 390 391 def scroll_into_view(self, locator: str) -> WebElement: 392 """Scrolls to a given element and returns the element.""" 393 element = self.find(locator) 394 self.script("arguments[0].scroll_into_view();", element) 395 self.chill(self.after_click_wait) 396 return element 397 398 def text(self, locator: str) -> str: 399 """Returns text of WebElement.""" 400 return self.find(locator).text 401 402 def click(self, locator: str) -> WebElement: 403 """Clicks on and returns WebElement.""" 404 element = self.find(locator) 405 element.click() 406 self.chill(self.after_click_wait) 407 return element 408 409 def clear(self, locator: str) -> WebElement: 410 """Clears content of WebElement if able 411 and then returns WebElement.""" 412 element = self.find(locator) 413 element.clear() 414 self.chill(self.after_click_wait) 415 return 
element 416 417 def switch_to_iframe(self, locator: str): 418 """Switch to an iframe from given locator.""" 419 self.browser.switch_to.frame(self.find(locator)) 420 421 def switch_to_parent_frame(self): 422 """Move up a frame level from current frame.""" 423 self.browser.switch_to.parent_frame() 424 425 def select( 426 self, locator: str, method: str, choice: str | int | tuple 427 ) -> WebElement: 428 """Select a choice from Select element. 429 Returns the Select element from the locator string, 430 not the option element that is selected. 431 432 :param method: Can be 'value' or 'index' 433 434 :param choice: The option to select. 435 436 If method is 'value', then choice should be 437 the html 'value' attribute of the desired option. 438 439 If method is 'index', choice can either be a single 440 int for the desired option or it can be a two-tuple. 441 If the tuple is provided, a random option between the 442 two indicies (inclusive) will be selected.""" 443 element = self.click(locator) 444 match method: 445 case "value": 446 Select(element).select_by_value(choice) 447 case "index": 448 if type(choice) == tuple: 449 choice = random.randint(choice[0], choice[1]) 450 Select(element).select_by_index(choice) 451 self.chill(self.after_field_wait) 452 return element 453 454 def click_elements( 455 self, locators: list[str], max_selections: int = None, min_selections: int = 1 456 ) -> WebElement: 457 """Click a random number of WebElements 458 and return the last WebElement clicked. 459 460 :param locators: A list of element locators to choose from. 461 462 :param max_selections: The maximum number of elements to click. 463 If None, the maximum will be the length of the locators list. 464 465 :param min_selections: The minimum number of elements to click. 466 467 e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) 468 will click between 1 and 3 random elements from the list. 
469 """ 470 if not max_selections: 471 max_selections = len(locators) 472 for option in random.sample( 473 locators, k=random.randint(min_selections, max_selections) 474 ): 475 element = self.click(option) 476 return element 477 478 def get_click_list( 479 self, num_options: int, max_choices: int = 1, min_choices: int = 1 480 ) -> list[str]: 481 """Similar to self.click_elements(), but for use with the self.fill_next() method. 482 483 Creates a list of length 'num_options' where every element is 'skip'. 484 485 A random number of elements in the list between 'min_choices' and 'max_choices' are 486 replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).""" 487 click_list = ["skip"] * num_options 488 selected_indexes = [] 489 for i in range(random.randint(min_choices, max_choices)): 490 index = random.randint(0, num_options - 1) 491 while index in selected_indexes: 492 index = random.randint(0, num_options - 1) 493 selected_indexes.append(index) 494 click_list[index] = self.keys.SPACE 495 return click_list 496 497 def send_keys( 498 self, 499 locator: str, 500 data: str, 501 click_first: bool = True, 502 clear_first: bool = False, 503 ) -> WebElement: 504 """Types data into element and returns the element. 505 506 :param data: The string to send to the element. 507 508 :param click_first: If True, the element is clicked on 509 before the data is sent. 
510 511 :param clear_first: If True, the current text of the element 512 is cleared before the data is sent.""" 513 element = self.click(locator) if click_first else self.find(locator) 514 if clear_first: 515 element.clear() 516 self.chill(self.after_click_wait) 517 if self.one_key_at_a_time: 518 for ch in str(data): 519 element.send_keys(ch) 520 self.chill(self.after_key_wait) 521 else: 522 element.send_keys(str(data)) 523 self.chill(self.after_field_wait) 524 return element 525 526 def fill_next( 527 self, data: list[str | tuple], start_element: WebElement = None 528 ) -> WebElement: 529 """Fills a form by tabbing from the current WebElement 530 to the next one and using the corresponding item in data. 531 Returns the last WebElement. 532 533 :param data: A list of form data. If an item is a string (except for 'skip') 534 it will be typed into the current WebElement. 535 536 An item in data can be a two-tuple of the form 537 ('downArrow', numberOfPresses:int|tuple[int, int]). 538 539 If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent 540 that many times to the WebElement. 541 542 If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random 543 number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. 544 This is typically for use with Select elements. 545 546 An item in data can also be 'skip', which will perform no action on the current 547 WebElement and will continue to the next one. 548 549 An item in data can also be 'click=n', where 'n' is an integer b/t 0 and 100, 550 representing a percent chance an element will be clicked or skipped: 551 >>> user.fill_next(["click=70"]) 552 553 has a 70% chance of being 554 >>> user.fill_next([user.keys.SPACE]) 555 556 and a 30% chance of being 557 >>> user.fill_next(["skip"]) 558 559 560 :param start_element: The WebElement to start tabbing from. 561 The currently active element will be used if start_element is None. 
562 563 Note: The function tabs to the next element before sending data, 564 so the start_element should the WebElement before the one 565 that should receive data[0]. 566 """ 567 element = ( 568 self.browser.switch_to.active_element 569 if not start_element 570 else start_element 571 ) 572 for datum in data: 573 element.send_keys(Keys.TAB) 574 element = self.browser.switch_to.active_element 575 self.chill(self.after_key_wait) 576 if type(datum) == str and datum.strip().startswith("click="): 577 chance = int(datum.split("=")[1].strip()) 578 if random.randint(0, 100) <= chance: 579 datum = Keys.SPACE 580 else: 581 datum = "skip" 582 if datum[0] == "downArrow": 583 if type(datum[1]) == tuple: 584 times = random.randint(datum[1][0], datum[1][1]) 585 else: 586 times = datum[1] 587 for _ in range(times): 588 element.send_keys(Keys.ARROW_DOWN) 589 self.chill(self.after_key_wait) 590 elif datum == "skip": 591 self.chill(self.after_key_wait) 592 else: 593 594 if self.turbo_engaged: 595 element.send_keys(str(datum)) 596 else: 597 for ch in str(datum): 598 element.send_keys(ch) 599 self.chill(self.after_key_wait) 600 self.chill(self.after_field_wait) 601 return element 602 603 def wait_until( 604 self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1 605 ): 606 """Checks condition repeatedly until either it is true, 607 or the max_wait is exceeded. 608 609 Raises a TimeoutError if the condition doesn't success within max_wait. 610 611 Useful for determing whether a form has been successfully submitted. 612 613 :param condition: The condition function to check. 614 615 :param max_wait: Number of seconds to continue checking condition 616 before throwing a TimeoutError. 617 618 :param polling_interval: The number of seconds to sleep before 619 checking the condition function again after it fails. 620 621 e.g. 
self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]))""" 622 start_time = time.time() 623 while True: 624 try: 625 if condition(): 626 time.sleep(1) 627 break 628 elif (time.time() - start_time) > max_wait: 629 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 630 else: 631 time.sleep(polling_interval) 632 except: 633 if (time.time() - start_time) > max_wait: 634 raise TimeoutError(f"max_wait exceeded in wait_until({condition})") 635 else: 636 time.sleep(polling_interval) 637 638 def dismiss_alert(self): 639 """Dismiss alert dialog.""" 640 self.browser.switch_to.alert.dismiss() 641 642 def solve_recaptcha_v3( 643 self, 644 outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]', 645 inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]', 646 ): 647 """Pass google recaptcha v3 by solving an audio puzzle. 648 649 :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. 650 If it's the recaptcha without the initial checkbox that just shows the image puzzle, 651 pass None to this argument. 652 653 """ 654 locator_method = self.locator_method 655 self.locator_method = "xpath" 656 try: 657 if outer_iframe_xpath: 658 self.switch_to_iframe(outer_iframe_xpath) 659 self.click('//*[@id="recaptcha-anchor"]') 660 self.switch_to_parent_frame() 661 self.switch_to_iframe(inner_iframe_xpath) 662 self.click('//*[@id="recaptcha-audio-button"]') 663 mp3_url = self.find( 664 '//a[@class="rc-audiochallenge-tdownload-link"]' 665 ).get_attribute("href") 666 text = get_text_from_url(mp3_url, ".mp3") 667 self.send_keys('//*[@id="audio-response"]', text) 668 self.click('//*[@id="recaptcha-verify-button"]') 669 except Exception as e: 670 print(e) 671 raise Exception("Could not solve captcha") 672 finally: 673 self.switch_to_parent_frame() 674 self.locator_method = locator_method
Sits on top of selenium to streamline automation and scraping tasks.
33 def __init__( 34 self, 35 headless: bool = False, 36 browser_type: str = "firefox", 37 implicit_wait: int = 10, 38 page_load_timeout: int = 60, 39 open_browser: bool = True, 40 locator_method: str = "xpath", 41 randomize_user_agent: bool = True, 42 user_agent_rotation_period: int = None, 43 move_window_by: tuple[int, int] = (0, -1000), 44 download_dir: str | Path = None, 45 driver_path: str | Path = None, 46 ): 47 """ 48 :param headless: If True, browser window will not be visible. 49 50 :param browser_type: Which browser to use. Can be 'firefox' or 'chrome'. 51 52 :param implicit_wait: Number of seconds to look for a specified element before 53 selenium considers it missing and throws an exception. 54 55 :param page_load_timeout: Time in seconds for selenium to wait for a page to load 56 before throwing an exception. 57 58 :param open_browser: If True, opens a browser window when a User object is created. 59 If False, a manual call to self.open_browser() must be made. 60 61 :param locator_method: The locator type User should expect to be given. 62 Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. 63 Every member function with a 'locator' argument refers to a string matching 64 the current locator_method. 65 66 :param randomize_user_agent: If True, a random useragent will be used whenever 67 the browser is opened. If False, the native useragent will be used. 68 69 :param user_agent_rotation_period: If not None, the browser window will be closed 70 and reopened with a new useragent every user_agent_rotation_period number of minutes. 71 Rotation occurs on the first call to self.get() after the time period has elapsed. 72 Ignored if randomize_user_agent is False. 73 74 :param move_window_by: The x and y amount of pixels to move the browser window by after opening. 75 76 :param download_dir: The download folder to use. If None, the default folder will be used. 77 78 :param driver_path: The path to the webdriver executable selenium should use. 
79 If None, the system PATH will be checked for the executable. 80 If the executable isn't found, the parent directories and the immediate child directories 81 of the current working directory will be searched. 82 """ 83 self.headless = headless 84 browser_type = browser_type.lower() 85 if browser_type in ["firefox", "chrome"]: 86 self.browser_type = browser_type 87 else: 88 raise ValueError("'browser_type' parameter must be 'firefox' or 'chrome'") 89 self.browser_open = False 90 self.implicit_wait = implicit_wait 91 self.page_load_timeout = page_load_timeout 92 self.rotation_timer = Timer() 93 self.randomize_user_agent = randomize_user_agent 94 self.user_agent_rotation_period = user_agent_rotation_period 95 self.locator_method = locator_method 96 self.turbo() 97 self.keys = Keys 98 self.move_window_by = move_window_by 99 self.download_dir = download_dir 100 self.driver_path = driver_path 101 if not self.driver_path: 102 self.search_for_driver() 103 if open_browser: 104 self.open_browser() 105 else: 106 self.browser = None 107 atexit.register(self.close_browser)
Parameters
headless: If True, browser window will not be visible.
browser_type: Which browser to use. Can be 'firefox' or 'chrome'.
implicit_wait: Number of seconds to look for a specified element before selenium considers it missing and throws an exception.
page_load_timeout: Time in seconds for selenium to wait for a page to load before throwing an exception.
open_browser: If True, opens a browser window when a User object is created. If False, a manual call to self.open_browser() must be made.
locator_method: The locator type User should expect to be given. Can be 'xpath', 'id', 'className', 'name', or 'cssSelector'. Every member function with a 'locator' argument refers to a string matching the current locator_method.
randomize_user_agent: If True, a random useragent will be used whenever the browser is opened. If False, the native useragent will be used.
user_agent_rotation_period: If not None, the browser window will be closed and reopened with a new useragent every user_agent_rotation_period number of minutes. Rotation occurs on the first call to self.get() after the time period has elapsed. Ignored if randomize_user_agent is False.
move_window_by: The x and y amount of pixels to move the browser window by after opening.
download_dir: The download folder to use. If None, the default folder will be used.
driver_path: The path to the webdriver executable selenium should use. If None, the system PATH will be checked for the executable. If the executable isn't found, the parent directories and the immediate child directories of the current working directory will be searched.
def configure_firefox(self) -> None:
    """Configure options, profile and service for firefox.

    The results are stored on the instance rather than returned:
    self.options (FirefoxOptions), self.profile (a FirefoxProfile when
    self.download_dir is set, otherwise None) and self.service
    (FirefoxService)."""
    self.options = options = FirefoxOptions()
    options.headless = self.headless
    options.set_preference("widget.windows.window_occlusion_tracking.enabled", False)
    options.set_preference("dom.webaudio.enabled", False)
    if self.randomize_user_agent:
        options.set_preference("general.useragent.override", get_agent())
    if self.download_dir:
        # Create the download folder up front so the preference points at a real directory.
        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        self.profile = FirefoxProfile()
        self.profile.set_preference("browser.download.dir", str(self.download_dir))
        self.profile.set_preference("browser.download.folderList", 2)
    else:
        self.profile = None
    service_kwargs = {"log_path": os.devnull}
    # Only pass a driver path when one is known; str(None) would hand
    # selenium the literal string "None" as the executable.
    if self.driver_path:
        service_kwargs["executable_path"] = str(self.driver_path)
    self.service = FirefoxService(**service_kwargs)
Configure options and profile for firefox.
def configure_chrome(self) -> None:
    """Configure options and service for chrome.

    The results are stored on the instance rather than returned:
    self.options (ChromeOptions) and self.service (ChromeService)."""
    self.options = options = ChromeOptions()
    options.headless = self.headless
    for argument in (
        "--disable-blink-features=AutomationControlled",
        "--mute-audio",
        "--disable-infobars",
        "--disable-notifications",
        "--log-level=3",
    ):
        options.add_argument(argument)
    if self.randomize_user_agent:
        options.add_argument(f"--user-agent={get_agent()}")
    options.add_experimental_option("useAutomationExtension", False)
    if self.download_dir:
        # Create the download folder up front so the preference points at a real directory.
        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        options.add_experimental_option(
            "prefs", {"download.default_directory": str(self.download_dir)}
        )
    service_kwargs = {"log_path": os.devnull}
    # Only pass a driver path when one is known; str(None) would hand
    # selenium the literal string "None" as the executable.
    if self.driver_path:
        service_kwargs["executable_path"] = str(self.driver_path)
    self.service = ChromeService(**service_kwargs)
Configure options and profile for chrome.
157 def search_for_driver(self): 158 """Searches for the webdriver executable.""" 159 cwd = Path.cwd() 160 found = False 161 match self.browser_type: 162 case "firefox": 163 driver = "geckodriver.exe" 164 case "chrome": 165 driver = "chromedriver.exe" 166 # search PATH 167 env_path = os.environ["PATH"] 168 if sys.platform == "win32": 169 env_paths = env_path.split(";") 170 else: 171 env_paths = env_path.split(":") 172 driver = driver[: driver.find(".")] 173 for path in env_paths: 174 if (Path(path) / driver).exists(): 175 self.driver_path = Path(path) / driver 176 found = True 177 break 178 # check current working directory and parent folders 179 if not found: 180 while cwd != cwd.parent: 181 if (cwd / driver).exists(): 182 self.driver_path = cwd / driver 183 found = True 184 break 185 cwd = cwd.parent 186 # check top most level 187 if not found and (cwd / driver).exists(): 188 self.driver_path = cwd / driver 189 found = True 190 # check child folders (only 1 level down) 191 if not found: 192 for child in Path.cwd().iterdir(): 193 if child.is_dir() and (child / driver).exists(): 194 self.driver_path = child / driver 195 found = True 196 if not found: 197 warn(f"Could not find {driver}")
Searches for the webdriver executable.
199 def set_implicit_wait(self, wait_time: int = None): 200 """Sets to default time if no arg given.""" 201 if not wait_time: 202 self.browser.implicitly_wait(self.implicit_wait) 203 else: 204 self.browser.implicitly_wait(wait_time)
Sets to default time if no arg given.
def open_browser(self):
    """Configure and launch the selenium browser.

    If a browser is already open, only a warning is issued.
    Otherwise the browser matching self.browser_type is configured and
    started, the implicit wait and page load timeout are applied, the
    window is maximized, moved by self.move_window_by and maximized again,
    self.tab_index is reset to 0 and the rotation timer is started."""
    if self.browser_open:
        warn("Browser already open.")
        return
    if self.browser_type == "firefox":
        self.configure_firefox()
        self.browser = webdriver.Firefox(
            options=self.options,
            service=self.service,
            firefox_profile=self.profile,
        )
    elif self.browser_type == "chrome":
        self.configure_chrome()
        self.browser = webdriver.Chrome(options=self.options, service=self.service)
    self.set_implicit_wait()
    self.browser.maximize_window()
    x_offset, y_offset = self.move_window_by[0], self.move_window_by[1]
    self.browser.set_window_position(x_offset, y_offset)
    self.browser.maximize_window()
    self.browser.set_page_load_timeout(self.page_load_timeout)
    self.browser_open = True
    self.tab_index = 0
    self.rotation_timer.start()
Configures and opens selenium browser.
def close_browser(self):
    """Quit the browser if it is currently open; otherwise do nothing."""
    if not self.browser_open:
        return
    # Flip the flag first so the object reads as closed before quit() runs.
    self.browser_open = False
    self.browser.quit()
Close browser window.
def open_tab(self, url: str = "", switch_to_tab: bool = True):
    """Open a new tab via window.open and, if provided, go to url.

    :param url: The address passed to window.open.

    :param switch_to_tab: If True, switch to the tab at self.tab_index + 1
    afterwards, i.e. the new tab is expected to sit directly after
    the currently active one."""
    self.script("window.open(arguments[0]);", url)
    if not switch_to_tab:
        return
    next_index = self.tab_index + 1
    self.switch_to_tab(next_index)
Opens new tab and, if provided, goes to url.
New tab is inserted after currently active tab.
def switch_to_tab(self, tab_index: int):
    """Make the tab at the given zero based index the active one
    and remember the index in self.tab_index."""
    handle = self.browser.window_handles[tab_index]
    self.browser.switch_to.window(handle)
    self.tab_index = tab_index
Switch to a tab in browser, zero indexed.
def get_num_tabs(self) -> int:
    """Return how many window handles (tabs) the browser currently has."""
    handles = self.browser.window_handles
    return len(handles)
Returns number of tabs open.
def close_tab(self, tab_index: int = 1):
    """Close the tab at tab_index, then switch to tab index 0.

    :param tab_index: Zero based index of the tab to close."""
    # The tab has to be active before browser.close() can close it.
    self.switch_to_tab(tab_index)
    self.browser.close()
    first_tab = 0
    self.switch_to_tab(first_tab)
Closes the specified tab and switches to tab index 0.
def get(self, url: str):
    """Load the page at url, rotating the useragent first if it is due.

    Opens the browser if it isn't open yet. When useragent randomization
    is on, a rotation period is set and the rotation timer has run longer
    than that many minutes, the browser is closed and reopened before
    the request. After loading, navigator.webdriver is overridden to
    report false and the arrival wait is slept."""
    if not self.browser_open:
        self.open_browser()
    rotation_due = (
        self.randomize_user_agent
        and self.user_agent_rotation_period is not None
        # The period is given in minutes, the timer reports seconds.
        and self.rotation_timer.elapsed > (60 * self.user_agent_rotation_period)
    )
    if rotation_due:
        self.rotation_timer.stop()
        self.close_browser()
        self.open_browser()
    self.browser.get(url)
    self.script("Object.defineProperty(navigator, 'webdriver', {get: () => false})")
    self.chill(self.arrival_wait)
Requests webpage at given url and rotates userAgent if necessary.
def get_soup(self) -> BeautifulSoup:
    """Parse the current page source with the 'html.parser' backend
    and return the resulting BeautifulSoup object."""
    page_source = self.browser.page_source
    return BeautifulSoup(page_source, "html.parser")
Returns a BeautifulSoup object of the current page source.
def current_url(self) -> str:
    """Return the url the browser reports for the active tab."""
    url = self.browser.current_url
    return url
Returns current url of active tab.
def turbo(self, engage: bool = True):
    """Switch between fast and human-like pacing.

    When engaged, strings are sent to elements all at once, the waits
    after keys, fields and clicks are zero and the arrival wait
    is one second.

    When disengaged, strings are sent 'one key at a time' and every
    wait is a (min, max) range from which a random duration is drawn."""
    if engage:
        key_wait, field_wait, click_wait, arrival = (0, 0), (0, 0), (0, 0), (1, 1)
    else:
        key_wait, field_wait, click_wait, arrival = (0.1, 0.5), (1, 2), (0.25, 1.5), (4, 10)
    self.after_key_wait = key_wait
    self.after_field_wait = field_wait
    self.after_click_wait = click_wait
    self.arrival_wait = arrival
    self.one_key_at_a_time = not engage
    self.turbo_engaged = bool(engage)
When engaged, strings will be sent to elements all at once and there will be no waiting after actions.
When disengaged, strings will be sent to elements 'one key at a time' with randomized amounts of time between successive keys and after actions.
def chill(self, min_max: tuple[float, float]):
    """Sleep for a random number of seconds drawn uniformly
    between min_max[0] and min_max[1]."""
    shortest, longest = min_max[0], min_max[1]
    duration = random.uniform(shortest, longest)
    time.sleep(duration)
Sleeps a random amount between min_max[0] and min_max[1].
def script(self, script: str, args: Any = None) -> Any:
    """Run javascript in the browser and hand back whatever it returns.

    :param script: The javascript source to execute.

    :param args: Passed on as the single script argument
    (available to the script as arguments[0])."""
    result = self.browser.execute_script(script, args)
    return result
Executes JavaScript code and returns the result.
def remove(self, locator: str):
    """Find the element for locator and remove it from the DOM."""
    target = self.find(locator)
    self.script("arguments[0].remove();", target)
Removes element from DOM.
def get_length(self, locator: str) -> int:
    """Return the javascript 'length' property of the element
    found for locator, converted to an int."""
    target = self.find(locator)
    length = self.script("return arguments[0].length;", target)
    return int(length)
Returns number of child elements for a given element.
335 def find(self, locator: str) -> WebElement: 336 """Finds and returns a WebElement.""" 337 match self.locator_method: 338 case "xpath": 339 return self.browser.find_element(By.XPATH, locator) 340 case "id": 341 return self.browser.find_element(By.ID, locator) 342 case "className": 343 return self.browser.find_element(By.CLASS_NAME, locator) 344 case "name": 345 return self.browser.find_element(By.NAME, locator) 346 case "cssSelector": 347 return self.browser.find_element(By.CSS_SELECTOR, locator)
Finds and returns a WebElement.
def find_children(self, locator: str) -> list[WebElement]:
    """Return the direct child WebElements of the element
    found for the given locator."""
    parent = self.find(locator)
    # "./*" selects every element directly below the parent.
    children = parent.find_elements("xpath", "./*")
    return children
Returns a list of child WebElements for given locator arg.
355 def scroll(self, amount: int = None, fraction: float = None): 356 """Scroll web page. 357 :param amount: The number of lines to scroll if not None. 358 359 :param fraction: The amount between 0.0 and 1.0 360 of the page height to scroll. 361 362 If values are provided for both arguments, 363 amount will be used. 364 365 If values are provided for neither argument, 366 the entire page length will be scrolled. 367 368 Scrolls one line at a time if self.turbo is False.""" 369 if amount: 370 amount_to_scroll = amount 371 elif fraction: 372 amount_to_scroll = int( 373 fraction 374 * ( 375 int(self.script("return document.body.scrollHeight;")) 376 - int(self.script("return window.pageYOffset;")) 377 ) 378 ) 379 else: 380 amount_to_scroll = int(self.script("return document.body.scrollHeight;")) 381 if self.turbo_engaged: 382 self.script("window.scrollBy(0,arguments[0]);", amount_to_scroll) 383 else: 384 for _ in range(abs(amount_to_scroll)): 385 if amount_to_scroll >= 0: 386 self.script("window.scrollBy(0,1);") 387 else: 388 self.script("window.scrollBy(0,-1);") 389 self.chill(self.after_click_wait)
Scroll web page.
Parameters
amount: The number of pixels to scroll if not None.
fraction: The amount between 0.0 and 1.0 of the page height to scroll.
If values are provided for both arguments, amount will be used.
If values are provided for neither argument, the entire page length will be scrolled.
Scrolls one pixel at a time if turbo is disengaged (self.turbo_engaged is False).
def scroll_into_view(self, locator: str) -> WebElement:
    """Scrolls to a given element and returns the element.

    :param locator: Locator string for the element to bring into view."""
    element = self.find(locator)
    # The DOM method is camelCase; "scroll_into_view" does not exist in javascript.
    self.script("arguments[0].scrollIntoView();", element)
    self.chill(self.after_click_wait)
    return element
Scrolls to a given element and returns the element.
def text(self, locator: str) -> str:
    """Return the 'text' attribute of the WebElement found for locator."""
    element = self.find(locator)
    return element.text
Returns text of WebElement.
def click(self, locator: str) -> WebElement:
    """Find the WebElement for locator, click it, sleep the
    after-click wait and return the WebElement."""
    target = self.find(locator)
    target.click()
    self.chill(self.after_click_wait)
    return target
Clicks on and returns WebElement.
def clear(self, locator: str) -> WebElement:
    """Find the WebElement for locator, clear its content, sleep the
    after-click wait and return the WebElement."""
    target = self.find(locator)
    target.clear()
    self.chill(self.after_click_wait)
    return target
Clears content of WebElement if able and then returns WebElement.
def switch_to_iframe(self, locator: str):
    """Find the iframe element for locator and switch the browser into it."""
    frame = self.find(locator)
    self.browser.switch_to.frame(frame)
Switch to an iframe from given locator.
def switch_to_parent_frame(self):
    """Switch the browser to the parent of the current frame."""
    switcher = self.browser.switch_to
    switcher.parent_frame()
Move up a frame level from current frame.
def select(
    self, locator: str, method: str, choice: str | int | tuple
) -> WebElement:
    """Click a Select element and pick one of its options.
    Returns the Select element found for the locator string,
    not the option element that is selected.

    :param method: Can be 'value' or 'index'.
    Any other value selects nothing.

    :param choice: The option to select.

    If method is 'value', then choice should be
    the html 'value' attribute of the desired option.

    If method is 'index', choice can either be a single
    int for the desired option or it can be a two-tuple.
    If the tuple is provided, a random option between the
    two indices (inclusive) will be selected."""
    element = self.click(locator)
    if method == "value":
        Select(element).select_by_value(choice)
    elif method == "index":
        # A tuple is a (low, high) range to draw the index from.
        index = random.randint(choice[0], choice[1]) if type(choice) is tuple else choice
        Select(element).select_by_index(index)
    self.chill(self.after_field_wait)
    return element
Select a choice from Select element. Returns the Select element from the locator string, not the option element that is selected.
Parameters
method: Can be 'value' or 'index'
choice: The option to select.
If method is 'value', then choice should be the html 'value' attribute of the desired option.
If method is 'index', choice can either be a single int for the desired option or it can be a two-tuple. If the tuple is provided, a random option between the two indices (inclusive) will be selected.
def click_elements(
    self, locators: list[str], max_selections: int = None, min_selections: int = 1
) -> WebElement | None:
    """Click a random number of WebElements
    and return the last WebElement clicked.

    Returns None if no element ends up being clicked
    (e.g. min_selections is 0 and zero elements are drawn).

    :param locators: A list of element locators to choose from.

    :param max_selections: The maximum number of elements to click.
    If None, or larger than the length of the locators list,
    the maximum will be the length of the locators list.

    :param min_selections: The minimum number of elements to click.

    e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3)
    will click between 1 and 3 random elements from the list.
    """
    if not max_selections:
        max_selections = len(locators)
    # random.sample can't draw more items than the list holds.
    max_selections = min(max_selections, len(locators))
    num_selections = random.randint(min_selections, max_selections)
    element = None
    for option in random.sample(locators, k=num_selections):
        element = self.click(option)
    return element
Click a random number of WebElements and return the last WebElement clicked.
Parameters
locators: A list of element locators to choose from.
max_selections: The maximum number of elements to click. If None, the maximum will be the length of the locators list.
min_selections: The minimum number of elements to click.
e.g. self.click_elements([xpath1, xpath2, xpath3, xpath4], max_selections=3) will click between 1 and 3 random elements from the list.
def get_click_list(
    self, num_options: int, max_choices: int = 1, min_choices: int = 1
) -> list[str]:
    """Similar to self.click_elements(), but for use with the self.fill_next() method.

    Creates a list of length 'num_options' where every element is 'skip'.

    A random number of elements in the list between 'min_choices' and 'max_choices' are
    replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).

    :param num_options: The length of the returned list.

    :param max_choices: The maximum number of elements to replace.
    Clamped to 'num_options', since each position can only be chosen once.

    :param min_choices: The minimum number of elements to replace.
    Clamped to the range 0 to the effective maximum."""
    click_list = ["skip"] * num_options
    # Requesting more distinct positions than exist can never be satisfied,
    # so cap both bounds rather than searching for a free index forever.
    upper = max(0, min(max_choices, num_options))
    lower = min(max(min_choices, 0), upper)
    count = random.randint(lower, upper)
    # random.sample picks distinct indexes in one call.
    for index in random.sample(range(num_options), k=count):
        click_list[index] = self.keys.SPACE
    return click_list
Similar to self.click_elements(), but for use with the self.fill_next() method.
Creates a list of length 'num_options' where every element is 'skip'.
A random number of elements in the list between 'min_choices' and 'max_choices' are replaced with 'keys.SPACE' (interpreted as a click by almost all web forms).
def send_keys(
    self,
    locator: str,
    data: str,
    click_first: bool = True,
    clear_first: bool = False,
) -> WebElement:
    """Type data into the element at locator and return that element.

    :param locator: Locator string for the element to type into.

    :param data: The value to send to the element.
    It is converted with str() before being sent.

    :param click_first: If True, the element is obtained by clicking on it.
    If False, it is only looked up with self.find().

    :param clear_first: If True, the current text of the element
    is cleared before the data is sent."""
    if click_first:
        target = self.click(locator)
    else:
        target = self.find(locator)

    if clear_first:
        target.clear()
        self.chill(self.after_click_wait)

    text = str(data)
    if not self.one_key_at_a_time:
        # Send the whole string in a single call.
        target.send_keys(text)
    else:
        # Send one character at a time, pausing after each key.
        for key in text:
            target.send_keys(key)
            self.chill(self.after_key_wait)

    self.chill(self.after_field_wait)
    return target
Types data into element and returns the element.
Parameters
data: The string to send to the element.
click_first: If True, the element is clicked on before the data is sent.
clear_first: If True, the current text of the element is cleared before the data is sent.
def fill_next(
    self, data: list[str | tuple], start_element: WebElement = None
) -> WebElement:
    """Fills a form by tabbing from the current WebElement
    to the next one and using the corresponding item in data.
    Returns the last WebElement.

    :param data: A list of form data. If an item is a string (except for 'skip')
    it will be typed into the current WebElement.

    An item in data can be a two-tuple of the form
    ('downArrow', numberOfPresses:int|tuple[int, int]).

    If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent
    that many times to the WebElement.

    If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random
    number of times between numberOfPresses[0] and numberOfPresses[1] inclusive.
    This is typically for use with Select elements.

    An item in data can also be 'skip', which will perform no action on the current
    WebElement and will continue to the next one.

    An item in data can also be 'click=n', where 'n' is an integer between 0 and 100,
    representing a percent chance an element will be clicked or skipped:
    >>> user.fill_next(["click=70"])

    has a 70% chance of being
    >>> user.fill_next([user.keys.SPACE])

    and a 30% chance of being
    >>> user.fill_next(["skip"])

    Any other item is converted with str() and typed into the WebElement.

    :param start_element: The WebElement to start tabbing from.
    The currently active element will be used if start_element is None.

    Note: The function tabs to the next element before sending data,
    so the start_element should be the WebElement before the one
    that should receive data[0].
    """
    element = (
        start_element if start_element else self.browser.switch_to.active_element
    )
    for datum in data:
        # Move focus forward, then work on whichever element received it.
        element.send_keys(Keys.TAB)
        element = self.browser.switch_to.active_element
        self.chill(self.after_key_wait)

        if isinstance(datum, str) and datum.strip().startswith("click="):
            chance = int(datum.split("=")[1].strip())
            # Draw from 1..100 (100 outcomes) so 'click=0' never clicks,
            # 'click=100' always clicks and 'click=n' is exactly n percent.
            datum = Keys.SPACE if random.randint(1, 100) <= chance else "skip"

        # Only index into real, non-empty sequences: indexing an empty
        # string would raise IndexError.
        is_down_arrow = (
            isinstance(datum, (tuple, list))
            and len(datum) > 0
            and datum[0] == "downArrow"
        )
        if is_down_arrow:
            presses = datum[1]
            if isinstance(presses, tuple):
                presses = random.randint(presses[0], presses[1])
            for _ in range(presses):
                element.send_keys(Keys.ARROW_DOWN)
                self.chill(self.after_key_wait)
        elif datum == "skip":
            self.chill(self.after_key_wait)
        elif self.turbo_engaged:
            element.send_keys(str(datum))
        else:
            for ch in str(datum):
                element.send_keys(ch)
                self.chill(self.after_key_wait)
        self.chill(self.after_field_wait)
    return element
Fills a form by tabbing from the current WebElement to the next one and using the corresponding item in data. Returns the last WebElement.
Parameters
- data: A list of form data. If an item is a string (except for 'skip') it will be typed into the current WebElement.
An item in data can be a two-tuple of the form ('downArrow', numberOfPresses:int|tuple[int, int]).
If numberOfPresses is a single int, Keys.ARROW_DOWN will be sent that many times to the WebElement.
If numberOfPresses is a tuple, Keys.ARROW_DOWN will be sent a random number of times between numberOfPresses[0] and numberOfPresses[1] inclusive. This is typically for use with Select elements.
An item in data can also be 'skip', which will perform no action on the current WebElement and will continue to the next one.
An item in data can also be 'click=n', where 'n' is an integer between 0 and 100, representing a percent chance an element will be clicked or skipped:
>>> user.fill_next(["click=70"])
has a 70% chance of being
>>> user.fill_next([user.keys.SPACE])
and a 30% chance of being
>>> user.fill_next(["skip"])
- start_element: The WebElement to start tabbing from. The currently active element will be used if start_element is None.
Note: The function tabs to the next element before sending data, so the start_element should be the WebElement before the one that should receive data[0].
def wait_until(
    self, condition: LambdaType, max_wait: float = 10, polling_interval: float = 0.1
):
    """Checks condition repeatedly until either it is true,
    or the max_wait is exceeded.

    Raises a TimeoutError if the condition doesn't succeed within max_wait.
    An exception raised by the condition counts as a failed check; if the
    final check failed that way, the exception is attached as the cause
    of the TimeoutError.

    Useful for determining whether a form has been successfully submitted.

    :param condition: The condition function to check.
    It is called with no arguments.

    :param max_wait: Number of seconds to continue checking condition
    before throwing a TimeoutError.

    :param polling_interval: The number of seconds to sleep before
    checking the condition function again after it fails.

    e.g. self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]'))"""
    # A monotonic clock keeps the elapsed time immune to system clock changes.
    start_time = time.monotonic()
    while True:
        last_error = None
        try:
            satisfied = condition()
        except Exception as error:
            # Only ordinary errors count as "not yet"; KeyboardInterrupt and
            # SystemExit are deliberately left to propagate.
            satisfied = False
            last_error = error
        if satisfied:
            # Fixed one second pause once the condition holds.
            time.sleep(1)
            return
        if (time.monotonic() - start_time) > max_wait:
            raise TimeoutError(
                f"max_wait exceeded in wait_until({condition})"
            ) from last_error
        time.sleep(polling_interval)
Checks condition repeatedly until either it is true, or the max_wait is exceeded.
Raises a TimeoutError if the condition doesn't succeed within max_wait.
Useful for determining whether a form has been successfully submitted.
Parameters
condition: The condition function to check.
max_wait: Number of seconds to continue checking condition before throwing a TimeoutError.
polling_interval: The number of seconds to sleep before checking the condition function again after it fails.
e.g. self.wait_until(lambda: 'Successfully Submitted' in self.text('//p[@id="form-output"]'))
def dismiss_alert(self):
    """Switch to the browser's alert dialog and dismiss it."""
    alert = self.browser.switch_to.alert
    alert.dismiss()
Dismiss alert dialog.
def solve_recaptcha_v3(
    self,
    outer_iframe_xpath: str = '//iframe[@title="reCAPTCHA"]',
    inner_iframe_xpath: str = '//iframe[@title="recaptcha challenge expires in two minutes"]',
):
    """Pass google recaptcha v3 by solving an audio puzzle.

    The locator method is temporarily switched to 'xpath' and is restored,
    along with a switch back to the parent frame, whether or not solving succeeds.

    :param outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox.
    If it's the recaptcha without the initial checkbox that just shows the image puzzle,
    pass None to this argument.

    :param inner_iframe_xpath: Xpath to the iframe containing the audio challenge
    button, download link, response field and verify button.

    :raises Exception: With the message 'Could not solve captcha' if any step fails.
    The original error is attached as the cause.
    """
    locator_method = self.locator_method
    self.locator_method = "xpath"
    try:
        if outer_iframe_xpath:
            self.switch_to_iframe(outer_iframe_xpath)
            self.click('//*[@id="recaptcha-anchor"]')
            self.switch_to_parent_frame()
        self.switch_to_iframe(inner_iframe_xpath)
        self.click('//*[@id="recaptcha-audio-button"]')
        mp3_url = self.find(
            '//a[@class="rc-audiochallenge-tdownload-link"]'
        ).get_attribute("href")
        # Transcribe the audio challenge and submit the text as the answer.
        text = get_text_from_url(mp3_url, ".mp3")
        self.send_keys('//*[@id="audio-response"]', text)
        self.click('//*[@id="recaptcha-verify-button"]')
    except Exception as e:
        print(e)
        # Chain explicitly so the root cause is kept on the raised exception.
        raise Exception("Could not solve captcha") from e
    finally:
        self.switch_to_parent_frame()
        self.locator_method = locator_method
Pass google recaptcha v3 by solving an audio puzzle.
Parameters
- outer_iframe_xpath: Xpath to the iframe containing the recaptcha checkbox. If it's the recaptcha without the initial checkbox that just shows the image puzzle, pass None to this argument.