1"""
2:mod:`pandas.io.html` is a module containing functionality for dealing with
3HTML IO.
5"""

from collections import abc
import numbers
import os
import re

from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError, EmptyDataError

from pandas.core.dtypes.common import is_list_like

from pandas.core.construction import create_series_with_explicit_dtype

from pandas.io.common import is_url, urlopen, validate_header_arg
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser

_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False


def _importers():
    # import things we need
    # but make this done on a first use basis

    global _IMPORTS
    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
    bs4 = import_optional_dependency("bs4", raise_on_missing=False, on_version="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency(
        "lxml.etree", raise_on_missing=False, on_version="ignore"
    )
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency(
        "html5lib", raise_on_missing=False, on_version="ignore"
    )
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
    """
    return regex.sub(" ", s.strip())
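

# Illustrative sketch (comment only, not part of the original module): runs of
# whitespace collapse to a single space and newlines become spaces, e.g.
#
#     >>> _remove_whitespace("  foo   bar ")
#     'foo bar'
#     >>> _remove_whitespace("foo\nbar")
#     'foo bar'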


def _get_skiprows(skiprows):
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return skiprows
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
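

# Illustrative sketch (comment only, not part of the original module):
#
#     >>> _get_skiprows(slice(0, 5, 2))
#     [0, 2, 4]
#     >>> _get_skiprows(3)        # an integer passes through unchanged
#     3
#     >>> _get_skiprows(None)     # None means "skip nothing"
#     0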


def _read(obj):
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, "read"):
        text = obj.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
        try:
            if os.path.isfile(text):
                with open(text, "rb") as f:
                    return f.read()
        except (TypeError, ValueError):
            pass
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
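

# Illustrative sketch (comment only, not part of the original module): a raw
# HTML string that is neither a URL nor an existing file path is returned
# unchanged:
#
#     >>> _read("<table><tr><td>1</td></tr></table>")
#     '<table><tr><td>1</td></tr></table>'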


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

        .. versionadded:: 0.23.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

        .. versionadded:: 0.23.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(self, io, match, attrs, encoding, displayed_only):
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        generator of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
                 - Put all rows into body
                 - Move rows from top of body to header only if
                   all elements inside row are <th>
                 - Move rows from bottom of body to footer only if
                   all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows)
        body = self._expand_colspan_rowspan(body_rows)
        footer = self._expand_colspan_rowspan(footer_rows)

        return header, body, footer
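
    # Illustrative sketch (comment only, not part of the original module):
    # given a table with no <thead> but a leading all-<th> row,
    #
    #     <table>
    #       <tr><th>a</th><th>b</th></tr>
    #       <tr><td>1</td><td>2</td></tr>
    #     </table>
    #
    # _parse_thead_tbody_tfoot returns header=[["a", "b"]],
    # body=[["1", "2"]], footer=[].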

    def _expand_colspan_rowspan(self, rows):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s

        Returns
        -------
        list of list
            Each returned row is a list of str text.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        remainder = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
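
    # Illustrative sketch (comment only, not part of the original module):
    # a cell with colspan="2" is duplicated across its columns, and a cell
    # with rowspan="2" is carried down into the following row:
    #
    #     <tr><td colspan="2">x</td></tr>            ->  ["x", "x"]
    #
    #     <tr><td rowspan="2">y</td><td>z</td></tr>  ->  ["y", "z"]
    #     <tr><td>w</td></tr>                        ->  ["y", "w"]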

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(text=match) is not None:
                result.append(table)
            unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding
        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"


_re_namespace = {"re": "http://exslt.org/regular-expressions"}
_valid_schemes = "http", "file", "ftp"


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body):
    data = [len(elem) for elem in body]
    lens = create_series_with_explicit_dtype(data, dtype_if_empty=object)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
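

# Illustrative sketch (comment only, not part of the original module): shorter
# rows are padded in place with empty strings to match the longest row:
#
#     >>> body = [["a", "b", "c"], ["d"]]
#     >>> _expand_elements(body)
#     >>> body
#     [['a', 'b', 'c'], ['d', '', '']]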


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    tp = TextParser(body, header=header, **kwargs)
    df = tp.read()
    return df


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor):
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    return _valid_parsers[flavor]


def _print_as_set(s) -> str:
    arg = ", ".join(pprint_thing(el) for el in s)
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
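

# Illustrative sketch (comment only, not part of the original module):
#
#     >>> _validate_flavor(None)       # default: try lxml first, then bs4
#     ('lxml', 'bs4')
#     >>> _validate_flavor("lxml")
#     ('lxml',)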


def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a "
                    "different flavor."
                )

            retained = caught
        else:
            break
    else:
        raise retained

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret


def read_html(
    io,
    match=".+",
    flavor=None,
    header=None,
    index_col=None,
    skiprows=None,
    attrs=None,
    parse_dates=False,
    thousands=",",
    encoding=None,
    decimal=".",
    converters=None,
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
):
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object or file-like object
        A URL, a file-like object, or a raw string containing HTML. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str or None
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like or None, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like or None, optional
        The column (or list of columns) to use to create the index.

    skiprows : int or list-like or slice or None, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict or None, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <http://www.w3.org/TR/html-markup/global-attributes.html>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str or None, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>``, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    .. versionadded:: 0.21.0

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    validate_header_arg(header)
    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
    )
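

# Illustrative usage sketch (comment only, not part of the original module):
# an in-memory HTML string with a leading all-<th> row parses to a single
# DataFrame whose header comes from that row:
#
#     >>> import pandas as pd
#     >>> html = """<table>
#     ...   <tr><th>a</th><th>b</th></tr>
#     ...   <tr><td>1</td><td>2</td></tr>
#     ... </table>"""
#     >>> pd.read_html(html)[0]
#        a  b
#     0  1  2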