Coverage for src/django_audit_log/models.py: 69%

284 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-05-02 11:48 +0700

1""" 

2Log information about requests 

3This is mostly taken from the request 

4and intended to be used with the "AccessLogMixin" 

5""" 

6 

7from typing import Dict, Optional, Any, NamedTuple 

8from urllib.parse import urlparse 

9import random 

10import re 

11 

12# Django imports 

13from django.conf import settings 

14from django.db import models 

15from django.http.request import HttpRequest 

16from django.http.response import HttpResponse 

17from django.urls import Resolver404, resolve 

18from django.urls.resolvers import ResolverMatch 

19 

20# Third-party imports (if any) 

try:
    from sentry_sdk import capture_exception
except ImportError:

    def capture_exception(exception):
        """
        Fallback error reporter used when ``sentry_sdk`` is not installed.

        In DEBUG mode the exception is re-raised so it surfaces during
        development. Otherwise it is written to this module's logger instead
        of being dropped silently.

        Args:
            exception: The exception instance to report
        """
        # Local import: keeps this fallback self-contained.
        import logging

        if settings.DEBUG:
            raise exception
        logging.getLogger(__name__).error(
            "Unhandled audit log error", exc_info=exception
        )

28 

29 

class LogPath(models.Model):
    """
    Mostly for deduplication of URLs: keeps the path, referrer, or response
    URL (i.e. the redirection target after a POST).
    """

    # NOTE: indexed but not unique, so duplicate rows for one path can exist.
    path = models.CharField(max_length=4096, null=False, blank=True, editable=False)

    class Meta:
        verbose_name = "Log Path"
        verbose_name_plural = "Log Paths"
        indexes = [
            models.Index(fields=["path"]),
        ]

    @staticmethod
    def normalize_path(url: str) -> str:
        """
        Reduce a URL to its path component.

        Scheme, host, port, query string and fragment are dropped. A value
        that is already a bare path is returned as its path component too.

        Args:
            url: The URL to normalize

        Returns:
            str: The path component, or "" for an empty/None input
        """
        if not url:
            return ""
        return urlparse(url).path

    @classmethod
    def _get_or_create_for_url(cls, url: Optional[str], label: str) -> Optional["LogPath"]:
        """
        Shared lookup used by every ``from_*`` constructor.

        Args:
            url: Raw URL or path to normalize and look up
            label: Short description of the URL's origin, used in the debug message

        Returns:
            Optional[LogPath]: The matching (or newly created) row
        """
        normalized_path = cls.normalize_path(url)
        try:
            return cls.objects.get_or_create(path=normalized_path)[0]
        except cls.MultipleObjectsReturned:
            # "path" has no unique constraint, so duplicates are possible;
            # fall back to the first match instead of failing the request log.
            if settings.DEBUG:
                print(f"Multiple LogPath objects found for {label}: {url}")
            return cls.objects.filter(path=normalized_path).first()

    @classmethod
    def from_request(cls, request: HttpRequest) -> "LogPath":
        """
        Create or get a LogPath instance from a request path.

        Args:
            request: The HTTP request object

        Returns:
            LogPath: The LogPath instance for the request path
        """
        return cls._get_or_create_for_url(request.path, "path")

    @classmethod
    def from_referrer(cls, request: HttpRequest) -> Optional["LogPath"]:
        """
        Create or get a LogPath instance from a request referrer.

        Args:
            request: The HTTP request object

        Returns:
            Optional[LogPath]: The LogPath instance for the referrer or None if no referrer
        """
        referrer = request.META.get("HTTP_REFERER")
        if not referrer:
            return None
        return cls._get_or_create_for_url(referrer, "referrer")

    @classmethod
    def from_response(cls, response: Optional[HttpResponse]) -> Optional["LogPath"]:
        """
        Create or get a LogPath instance from a response URL.

        Args:
            response: The HTTP response object

        Returns:
            Optional[LogPath]: The LogPath instance for the response URL, or
            None if there is no response or it has no ``url`` attribute
        """
        if response is None:
            return None

        # Only the attribute access is guarded, so an AttributeError raised
        # by the database call is no longer swallowed.
        try:
            url = response.url
        except AttributeError:
            return None
        return cls._get_or_create_for_url(url, "response URL")

    def __str__(self) -> str:
        """Return a string representation of the LogPath."""
        return self.path

130 

131 

class LogSessionKey(models.Model):
    """
    Keep the user's session key.
    Possibly useful to track user interaction over time.
    """

    key = models.CharField(max_length=1024, null=False, blank=True, editable=False)

    class Meta:
        verbose_name = "Log Session Key"
        verbose_name_plural = "Log Session Keys"
        indexes = [
            models.Index(fields=["key"]),
        ]

    @classmethod
    def from_request(cls, request: HttpRequest) -> Optional["LogSessionKey"]:
        """
        Create or get a LogSessionKey instance from a request session key.

        Args:
            request: The HTTP request object

        Returns:
            Optional[LogSessionKey]: The LogSessionKey instance, or None when
            the request has no session or the session has no key yet
        """
        # A request may carry no "session" attribute at all; treat that
        # like a missing key instead of raising AttributeError.
        session = getattr(request, "session", None)
        key = getattr(session, "session_key", None)
        if not key:
            return None
        return cls.objects.get_or_create(key=key)[0]

    def __str__(self) -> str:
        """Return a truncated string representation of the session key."""
        return f"{self.key[:5]}"

166 

167 

class LogUser(models.Model):
    """
    Rather than make a foreign-key to User, which may be deleted or changed,
    keep a record of the user ID and name.
    """

    id = models.IntegerField(
        primary_key=True, editable=False
    )  # Should correspond to a User ID
    # This is the username of the first logged request. It should not change but sometimes
    # people do fix spelling mistakes etc.
    user_name = models.CharField(
        max_length=1024, null=False, blank=True, editable=False
    )

    class Meta:
        verbose_name = "Log User"
        verbose_name_plural = "Log Users"

    @classmethod
    def from_request(cls, request: HttpRequest) -> "LogUser":
        """
        Create or get a LogUser instance from a request user.

        Anonymous users share a single row with id 0. In both branches the
        lookup uses only the primary key; the user name is applied only when
        the row is first created.

        Args:
            request: The HTTP request object

        Returns:
            LogUser: The LogUser instance
        """
        user = request.user
        if user.is_anonymous:
            # user_name goes in "defaults": as a lookup key, an existing id=0
            # row with another name would make the create collide on the
            # primary key.
            return cls.objects.get_or_create(
                id=0, defaults={"user_name": "anonymous"}
            )[0]
        return cls.objects.get_or_create(
            id=user.pk, defaults={"user_name": user.get_username()}
        )[0]

    def __str__(self) -> str:
        """Return a string representation of the logged user."""
        return f"{self.user_name} ({self.id})"

207 

208 

class LogIpAddress(models.Model):
    """
    Single field lists IP addresses of users.
    """

    address = models.GenericIPAddressField(editable=False)

    class Meta:
        verbose_name = "Log IP Address"
        verbose_name_plural = "Log IP Addresses"
        indexes = [
            models.Index(fields=["address"]),
        ]

    @classmethod
    def from_request(cls, request: HttpRequest) -> Optional["LogIpAddress"]:
        """
        Create or get a LogIpAddress instance from a request IP address.

        The first entry of ``X-Forwarded-For`` is preferred. If that header
        is missing or its first entry is empty, ``REMOTE_ADDR`` is used.

        Args:
            request: The HTTP request object

        Returns:
            Optional[LogIpAddress]: The LogIpAddress instance, or None when
            the request carries no usable address
        """
        # NOTE(review): X-Forwarded-For is client-supplied unless a trusted
        # proxy overwrites it - confirm the deployment does so.
        forwarded_for = request.META.get("HTTP_X_FORWARDED_FOR") or ""
        ip = forwarded_for.split(",")[0].strip() or request.META.get("REMOTE_ADDR")

        # Never hand an empty/None address to the non-nullable address field.
        if not ip:
            return None
        return cls.objects.get_or_create(address=ip)[0]

    def __str__(self) -> str:
        """Return a string representation of the IP address."""
        return self.address

245 

246 

class AccessLog(models.Model):
    """
    Primary model for logging access. You probably want to
    use a mixin - see "from_request method" - rather than directly accessing
    this.
    """

    # The source path, referrer, and response URL (if any)
    path = models.ForeignKey(
        LogPath, null=True, blank=True, on_delete=models.PROTECT, editable=False
    )
    referrer = models.ForeignKey(
        LogPath,
        null=True,
        blank=True,
        related_name="refers",
        on_delete=models.PROTECT,
        editable=False,
    )
    response_url = models.ForeignKey(
        LogPath,
        null=True,
        blank=True,
        related_name="response",
        on_delete=models.PROTECT,
        editable=False,
    )

    # Request type and content
    method = models.CharField(max_length=8, null=False, blank=True, editable=False)
    data = models.JSONField(help_text="Payload", editable=False)
    status_code = models.IntegerField(
        null=True, blank=True, help_text="Response code (200=OK)", editable=False
    )

    # User agent information (deprecated field kept for backward compatibility)
    user_agent = models.TextField(
        null=True,
        blank=True,
        help_text="User Agent string (deprecated)",
        editable=False,
    )

    # Foreign key to normalized user agent
    user_agent_normalized = models.ForeignKey(
        "LogUserAgent",
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
        editable=False,
        related_name="access_logs",
        help_text="Normalized user agent information",
    )

    # user details: username, ip address, session
    user = models.ForeignKey(
        LogUser, null=True, blank=True, on_delete=models.PROTECT, editable=False
    )
    session_key = models.ForeignKey(
        LogSessionKey, null=True, blank=True, on_delete=models.PROTECT, editable=False
    )
    ip = models.ForeignKey(
        LogIpAddress, null=True, blank=True, on_delete=models.PROTECT, editable=False
    )

    timestamp = models.DateTimeField(auto_now_add=True, db_index=True, editable=False)

    # Sampling metadata fields
    in_always_log_urls = models.BooleanField(
        default=False,
        editable=False,
        help_text="Whether this URL matched a pattern in AUDIT_LOG_ALWAYS_LOG_URLS",
    )
    in_sample_urls = models.BooleanField(
        default=False,
        editable=False,
        help_text="Whether this URL matched a pattern in AUDIT_LOG_SAMPLE_URLS",
    )
    sample_rate = models.FloatField(
        null=True,
        blank=True,
        editable=False,
        help_text="The AUDIT_LOG_SAMPLE_RATE value when this log was created",
    )

    # Define a NamedTuple for sampling results
    class SamplingResult(NamedTuple):
        """Results from checking if a request should be sampled."""

        should_log: bool
        in_always_log_urls: bool
        in_sample_urls: bool
        sample_rate: float

    class Meta:
        verbose_name = "Access Log"
        verbose_name_plural = "Access Logs"
        ordering = ["-timestamp"]
        indexes = [
            models.Index(fields=["timestamp"]),
            models.Index(fields=["method"]),
            models.Index(fields=["status_code"]),
        ]

    @classmethod
    def from_request(
        cls, request: HttpRequest, response: Optional[HttpResponse] = None
    ) -> Optional["AccessLog"]:
        """
        Create an access log entry from a request and optional response.

        Nothing is logged when the client IP is listed in
        ``AUDIT_LOG_EXCLUDED_IPS`` (default ``["127.0.0.1"]``) or when the
        sampling check rejects the request.

        Args:
            request: The HTTP request object
            response: Optional HTTP response object

        Returns:
            Optional[AccessLog]: The created AccessLog instance, or None if
            the request was skipped or creation failed

        Raises:
            Exception: Any creation error is re-raised when settings.DEBUG is
            true; otherwise it is passed to ``capture_exception``.
        """
        # Get excluded IPs from settings
        excluded_ips = getattr(settings, "AUDIT_LOG_EXCLUDED_IPS", ["127.0.0.1"])

        # Check if the request IP is excluded
        ip = request.META.get("HTTP_X_FORWARDED_FOR", "").split(",")[0].strip() or request.META.get("REMOTE_ADDR")
        if ip in excluded_ips:
            return None

        # Check if we should log this request based on sampling settings
        sampling_info = cls._check_sampling(request)
        if not sampling_info.should_log:
            return None

        def get_data() -> Dict[str, Any]:
            """
            Extract GET and POST data, excluding "sensitive" POST fields.

            Returns:
                Dict[str, Any]: Dictionary with optional "get" and "post" keys
            """
            # Work on a copy so the request's own POST data is not modified
            post = request.POST.copy()

            # Remove sensitive fields (POST only; GET is stored as received)
            sensitive_fields = ["password", "csrfmiddlewaretoken", "created_by"]
            for field in sensitive_fields:
                post.pop(field, None)

            get = dict(request.GET.copy())

            # Keep things short: drop if there is no GET or POST data
            data = {}
            if get:
                data["get"] = get
            if post:
                data["post"] = post
            return data

        try:
            # The user agent lookup may write to the database, so it runs
            # inside the guarded section like every other related-row lookup.
            user_agent_string = request.META.get("HTTP_USER_AGENT", "")
            user_agent_obj = None
            if user_agent_string:
                user_agent_obj = LogUserAgent.from_user_agent_string(user_agent_string)

            return cls.objects.create(
                # The source path, referrer, and response URL (if any)
                path=LogPath.from_request(request),
                referrer=LogPath.from_referrer(request),
                response_url=LogPath.from_response(response) if response else None,
                # Request type and content
                method=request.method,
                data=get_data(),
                status_code=response.status_code if response else None,
                # User agent (storing both for backward compatibility)
                user_agent=user_agent_string,
                user_agent_normalized=user_agent_obj,
                # user details: username, ip address, session
                user=LogUser.from_request(request),
                session_key=LogSessionKey.from_request(request),
                ip=LogIpAddress.from_request(request),
                # Sampling metadata
                in_always_log_urls=sampling_info.in_always_log_urls,
                in_sample_urls=sampling_info.in_sample_urls,
                sample_rate=sampling_info.sample_rate,
            )
        except Exception as e:
            if settings.DEBUG:
                raise
            capture_exception(e)
            return None

    @classmethod
    def _check_sampling(cls, request: HttpRequest) -> "AccessLog.SamplingResult":
        """
        Check if a request should be logged based on sampling settings.

        Reads ``AUDIT_LOG_SAMPLE_RATE`` (default 1.0),
        ``AUDIT_LOG_ALWAYS_LOG_URLS`` and ``AUDIT_LOG_SAMPLE_URLS`` (both
        default to empty). Patterns are applied with ``re.match``, i.e.
        anchored at the start of the request path.

        Args:
            request: The HTTP request object

        Returns:
            SamplingResult: Named tuple containing sampling information
        """
        # Get settings with defaults
        sample_rate = getattr(settings, "AUDIT_LOG_SAMPLE_RATE", 1.0)
        always_log_urls = getattr(settings, "AUDIT_LOG_ALWAYS_LOG_URLS", [])
        sample_urls = getattr(settings, "AUDIT_LOG_SAMPLE_URLS", [])

        # If no URL patterns are specified in either list, fall back to sampling all URLs
        if not always_log_urls and not sample_urls:
            return cls.SamplingResult(
                should_log=random.random() < sample_rate,
                in_always_log_urls=False,
                in_sample_urls=False,
                sample_rate=sample_rate,
            )

        path = request.path

        # First check if the URL should always be logged
        if any(re.match(pattern, path) for pattern in always_log_urls):
            return cls.SamplingResult(
                should_log=True,
                in_always_log_urls=True,
                in_sample_urls=False,
                sample_rate=sample_rate,
            )

        # Then check if the URL should be sampled
        if any(re.match(pattern, path) for pattern in sample_urls):
            return cls.SamplingResult(
                should_log=random.random() < sample_rate,
                in_always_log_urls=False,
                in_sample_urls=True,
                sample_rate=sample_rate,
            )

        # URLs not in either list are never logged
        return cls.SamplingResult(
            should_log=False,
            in_always_log_urls=False,
            in_sample_urls=False,
            sample_rate=sample_rate,
        )

    @classmethod
    def _should_log_request(cls, request: HttpRequest) -> bool:
        """
        Determine if the request should be logged based on sampling settings.

        Args:
            request: The HTTP request object

        Returns:
            bool: True if the request should be logged, False otherwise
        """
        return cls._check_sampling(request).should_log

    def __str__(self) -> str:
        """Return a string representation of the AccessLog."""
        status = f" [{self.status_code}]" if self.status_code else ""
        # timestamp is filled by auto_now_add on save, so it is still None on
        # an unsaved instance.
        when = (
            self.timestamp.strftime("%Y-%m-%d %H:%M:%S")
            if self.timestamp
            else "(unsaved)"
        )
        return f"{self.method} {self.path}{status} by {self.user} at {when}"

510 

511 

class LogUserAgent(models.Model):
    """
    Store user agent strings to avoid duplication in AccessLog.
    Also provides pre-parsed categorization of user agents.
    """

    user_agent = models.TextField(unique=True, editable=False)
    browser = models.CharField(max_length=256, null=True, blank=True, editable=False)
    browser_version = models.CharField(
        max_length=256, null=True, blank=True, editable=False
    )
    operating_system = models.CharField(
        max_length=256, null=True, blank=True, editable=False
    )
    operating_system_version = models.CharField(
        max_length=256, null=True, blank=True, editable=False,
        help_text="Version of the operating system if available"
    )
    device_type = models.CharField(max_length=256, null=True, blank=True, editable=False)
    is_bot = models.BooleanField(default=False, editable=False)

    class Meta:
        verbose_name = "Log User Agent"
        verbose_name_plural = "Log User Agents"
        indexes = [
            models.Index(fields=["browser"]),
            models.Index(fields=["operating_system"]),
            models.Index(fields=["device_type"]),
            models.Index(fields=["is_bot"]),
        ]

    # Model attribute -> key in the dict returned by
    # UserAgentUtil.normalize_user_agent.
    _PARSED_FIELDS = (
        ("browser", "browser"),
        ("browser_version", "browser_version"),
        ("operating_system", "os"),
        ("operating_system_version", "os_version"),
        ("device_type", "device_type"),
        ("is_bot", "is_bot"),
    )

    @classmethod
    def reimport_all(cls, batch_size=1000):
        """
        Reprocess all user agents with current parsing logic.
        This is useful when the parsing logic has been updated.

        Rows are read in primary-key order, ``batch_size`` at a time, and
        each batch is written inside its own transaction. Progress is
        printed to stdout.

        Args:
            batch_size: Number of records to process in each batch

        Returns:
            dict: Summary with "total_agents", "processed" and "updated" counts
        """
        from django.db import transaction

        total_agents = cls.objects.count()
        processed = 0
        updated = 0

        print(f"Found {total_agents} user agents to reprocess")

        # Offset slicing is only repeatable on an ordered queryset; without an
        # explicit ordering, batches could overlap or skip rows.
        ordered = cls.objects.order_by("pk")

        # Process in batches to avoid memory issues
        for start in range(0, total_agents, batch_size):
            batch = ordered[start:start + batch_size]

            with transaction.atomic():
                for agent in batch:
                    processed += 1

                    # Parse with current logic
                    info = UserAgentUtil.normalize_user_agent(agent.user_agent)

                    # Collect only the fields whose stored value is stale
                    stale = [
                        (attr, info[key])
                        for attr, key in cls._PARSED_FIELDS
                        if getattr(agent, attr) != info[key]
                    ]

                    if stale:
                        for attr, value in stale:
                            setattr(agent, attr, value)
                        agent.save()
                        updated += 1

                    if processed % batch_size == 0 or processed == total_agents:
                        print(f"Processed {processed}/{total_agents} user agents, updated {updated}")

        return {
            "total_agents": total_agents,
            "processed": processed,
            "updated": updated,
        }

    @classmethod
    def from_user_agent_string(cls, user_agent_string):
        """
        Create or get a LogUserAgent instance from a user agent string.
        Parses and categorizes the user agent during creation.

        Args:
            user_agent_string: The raw user agent string

        Returns:
            Optional[LogUserAgent]: The LogUserAgent instance, or None for an
            empty user agent string
        """
        if not user_agent_string:
            return None

        # Try the plain lookup first: no parsing needed for known agents.
        try:
            return cls.objects.get(user_agent=user_agent_string)
        except cls.DoesNotExist:
            pass

        # Use the same parser as reimport_all so stored values match what a
        # later reimport would compute.
        info = UserAgentUtil.normalize_user_agent(user_agent_string)

        # get_or_create rather than create: "user_agent" is unique, so a
        # concurrent request may have inserted the same row since the lookup.
        return cls.objects.get_or_create(
            user_agent=user_agent_string,
            defaults={attr: info[key] for attr, key in cls._PARSED_FIELDS},
        )[0]

    def __str__(self):
        """Return "<browser> <version> on <os> <os version> (<device type>)"."""
        os_version = f" {self.operating_system_version}" if self.operating_system_version else ""
        return f"{self.browser} {self.browser_version or ''} on {self.operating_system}{os_version} ({self.device_type})"

651 

652 

class UserAgentUtil:
    """Utility class for parsing and normalizing user agents."""

    # Browser patterns, tried in order; the first match wins.
    # Chromium-based browsers also send "Chrome/<n>" and "Safari/<n>" tokens,
    # and "HeadlessChrome/<n>" contains "Chrome/<n>", so the more specific
    # tokens must come before the generic Chrome and Safari entries.
    BROWSER_PATTERNS = [
        (r"tl\.eskola\.eskola_app-(\d+\.\d+\.\d+)-release(?:/(\w+))?", "Eskola APK"),  # Non-playstore format
        (r"tl\.eskola\.eskola_app\.playstore-(\d+\.\d+\.\d+)-release(?:/(\w+))?", "Eskola APK"),  # Playstore format
        (r"Edge/(\d+)", "Edge"),
        (r"Edg/(\d+)", "Edge"),  # New Edge based on Chromium
        (r"OPR/(\d+)", "Opera"),
        (r"Opera/(\d+)", "Opera"),
        (r"UCBrowser/(\d+)", "UC Browser"),
        (r"SamsungBrowser/(\d+)", "Samsung Browser"),
        (r"YaBrowser/(\d+)", "Yandex Browser"),
        (r"HeadlessChrome(?:/(\d+))?", "Headless Chrome"),
        (r"Chrome/(\d+)", "Chrome"),
        (r"Firefox/(\d+)", "Firefox"),
        # Safari reports its own version in "Version/<n>"; the number after
        # "Safari/" is the WebKit build.
        (r"Version/(\d+)\S*\s.*Safari/", "Safari"),
        (r"Safari/(\d+)", "Safari"),
        (r"MSIE\s(\d+)", "Internet Explorer"),
        (r"Trident/.*rv:(\d+)", "Internet Explorer"),
        (r"Googlebot", "Googlebot"),
        (r"bingbot", "Bingbot"),
        (r"DuckDuckBot", "DuckDuckBot"),
        (r"Dalvik/(\d+)", "Dalvik"),  # Android Runtime Environment
    ]

    # OS patterns, tried in order; Android must precede Linux because
    # Android user agents also contain "Linux".
    OS_PATTERNS = [
        (r"Windows NT 10\.0", "Windows 10"),
        (r"Windows NT 6\.3", "Windows 8.1"),
        (r"Windows NT 6\.2", "Windows 8"),
        (r"Windows NT 6\.1", "Windows 7"),
        (r"Windows NT 6\.0", "Windows Vista"),
        (r"Windows NT 5\.1", "Windows XP"),
        (r"Windows NT 5\.0", "Windows 2000"),
        (r"Macintosh.*Mac OS X", "macOS"),
        (r"Android\s+(\d+)", "Android"),  # Captures Android version
        (r"Linux", "Linux"),
        (r"iPhone.*OS\s+(\d+)", "iOS"),
        (r"iPad.*OS\s+(\d+)", "iOS"),
        (r"iPod.*OS\s+(\d+)", "iOS"),
        (r"CrOS", "Chrome OS"),
    ]

    # Device type patterns (matched case-insensitively)
    DEVICE_PATTERNS = [
        (r"iPhone", "Mobile"),
        (r"iPod", "Mobile"),
        (r"iPad", "Tablet"),
        (r"Android.*Mobile", "Mobile"),
        (r"Android(?!.*Mobile)", "Tablet"),
        (r"Mobile", "Mobile"),
        (r"Tablet", "Tablet"),
    ]

    # Bot/crawler patterns (matched case-insensitively)
    BOT_PATTERNS = [
        (r"bot|crawler|spider|crawl|Googlebot|bingbot|yahoo|slurp|ahref|semrush|baidu|DigitalOcean|Palo Alto Networks|Expanse", "Bot/Crawler"),
    ]

    @classmethod
    def normalize_user_agent(cls, user_agent):
        """
        Normalize a user agent string to categorize browsers, OS, and device types.

        Args:
            user_agent: The raw user agent string

        Returns:
            dict: Keys "browser", "browser_version", "os", "os_version",
            "device_type", "is_bot" and "raw" (the input as given). For an
            empty input every category is "Unknown"/None and is_bot is False.
        """
        result = {
            "browser": "Unknown",
            "browser_version": None,
            "os": "Unknown",
            "os_version": None,
            "device_type": "Unknown",
            "is_bot": False,
            "raw": user_agent,
        }
        if not user_agent:
            return result

        # Default to Mobile for Eskola APK.
        # NOTE(review): this default also applies to every non-bot agent that
        # matches no DEVICE_PATTERNS entry (e.g. desktop browsers) - confirm
        # that is intended.
        result["device_type"] = "Mobile"

        # Special case for Eskola APK (both formats)
        eskola_match = re.search(r"tl\.eskola\.eskola_app(?:\.playstore)?-(\d+\.\d+\.\d+)-release(?:/(\w+))?", user_agent)
        if eskola_match:
            result["browser"] = "Eskola APK"
            result["browser_version"] = eskola_match.group(1)
            result["os"] = "Android"
            # Try to extract device model if present
            if eskola_match.group(2):
                result["os_version"] = f"Device: {eskola_match.group(2)}"
            return result

        # Check if it's a bot
        for pattern, _ in cls.BOT_PATTERNS:
            if re.search(pattern, user_agent, re.IGNORECASE):
                result["is_bot"] = True
                result["browser"] = "Bot/Crawler"
                result["device_type"] = "Bot"
                break

        # Detect browser and version; a specific browser match replaces the
        # generic "Bot/Crawler" label while is_bot stays set.
        for pattern, browser in cls.BROWSER_PATTERNS:
            match = re.search(pattern, user_agent)
            if match:
                result["browser"] = browser
                # Group 1 is absent for version-less patterns and None when an
                # optional version group did not take part in the match.
                version = match.group(1) if match.groups() else None
                if version is not None and version.isdigit():
                    result["browser_version"] = version
                break

        # Special case for Dalvik (Android) user agents
        if "Dalvik" in user_agent:
            result["os"] = "Android"
            # Try to extract Android version
            android_match = re.search(r"Android\s+(\d+(?:\.\d+)*)", user_agent)
            if android_match:
                result["os_version"] = android_match.group(1)

        # Detect OS and version for other cases
        if result["os"] == "Unknown":  # Only if not already set by Dalvik check
            for pattern, os_name in cls.OS_PATTERNS:
                match = re.search(pattern, user_agent)
                if match:
                    result["os"] = os_name
                    # Extract version if available
                    if match.groups():
                        result["os_version"] = match.group(1)
                    # Special case for Windows 10
                    if os_name == "Windows 10":
                        result["os_version"] = "10"
                    break

        # Detect device type (only if not already a bot)
        if not result["is_bot"]:
            for pattern, device in cls.DEVICE_PATTERNS:
                if re.search(pattern, user_agent, re.IGNORECASE):
                    result["device_type"] = device
                    break

        return result