# OS-World (bec7129) — xlsx metric utilities
import builtins
#import datetime
import functools
import itertools
import logging
import operator
import os
import re
import zipfile
#import pandas as pd
from typing import Any, TypeVar, Union, Iterable, Optional, Callable
from typing import Dict, List, Set, Match, Tuple, Pattern
from urllib.parse import urlparse, urlunparse, ParseResult
import formulas
import lxml.cssselect
import lxml.etree
import xmltodict
from lxml.etree import _Element
from openpyxl import Workbook
from openpyxl.cell.cell import Cell, MergedCell
from openpyxl.chart._chart import ChartBase
from openpyxl.formatting.formatting import ConditionalFormattingList
from openpyxl.pivot.cache import CacheSource as PivotCacheSource
from openpyxl.pivot.table import TableDefinition as PivotTableDefinition
from openpyxl.styles.differential import DifferentialStyle
from openpyxl.utils import coordinate_to_tuple, get_column_letter
from openpyxl.worksheet.cell_range import MultiCellRange, CellRange
from openpyxl.worksheet.dimensions import DimensionHolder
from openpyxl.worksheet.filters import AutoFilter, SortState
from openpyxl.worksheet.worksheet import Worksheet
import tldextract
# Generic value type used by the rule-matching helpers below
V = TypeVar("Value")

logger = logging.getLogger("desktopenv.metrics.utils")

# XML namespaces used inside xlsx archives, as (prefix, uri) pairs
_xlsx_namespaces = [
    ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"),
    ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"),
    ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_ns_mapping = dict(_xlsx_namespaces)  # prefix -> uri, for CSS selectors
_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))  # uri -> prefix, for xmltodict
# the main spreadsheetml namespace is stripped entirely when parsing with xmltodict
_xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
# selects the per-sheet declarations inside xl/workbook.xml
_sheet_name_selector = lxml.cssselect.CSSSelector("oo|sheets>oo|sheet", namespaces=_xlsx_ns_mapping)
# selects the x14 sparkline elements inside a worksheet part
_sparklines_selector = lxml.cssselect.CSSSelector("x14|sparkline", namespaces=_xlsx_ns_mapping)
def load_sparklines(xlsx_file: str, sheet_name: str) -> Dict[str, str]:
    # function load_sparklines {{{ #
    """
    Extracts the sparkline definitions of a sheet by parsing the raw xlsx
    archive (openpyxl does not expose sparklines).

    Args:
        xlsx_file (str): path to xlsx
        sheet_name (str): sheet name

    Returns:
        Dict[str, str]: sparkline definitions mapping anchor cell to data
          source, e.g.
          {
            "F3": "Sheet1!C3:E3"
          }
          Empty dict on a broken workbook or an unknown sheet name.
    """

    # read xlsx
    try:
        with zipfile.ZipFile(xlsx_file, "r") as z_f:
            with z_f.open("xl/workbook.xml") as f:
                workbook_database: _Element = lxml.etree.fromstring(f.read())
                sheets: List[_Element] = _sheet_name_selector(workbook_database)
                sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets}
            with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f:
                sheet: _Element = lxml.etree.fromstring(f.read())
                sparklines: List[_Element] = _sparklines_selector(sheet)
    # KeyError: sheet_name not present in the workbook; treated as "nothing
    # found" for consistency with read_cell_value
    except (zipfile.BadZipFile, KeyError):
        return {}

    sparklines_dict: Dict[str, str] = {}
    for sp_l in sparklines:
        sparkline_xml: str = lxml.etree.tostring(sp_l, encoding="unicode")
        sparkline: Dict[str, Dict[str, str]] = xmltodict.parse(sparkline_xml,
                                                               process_namespaces=True,
                                                               namespaces=_xlsx_ns_imapping
                                                               )
        sparklines_dict[sparkline["x14:sparkline"]["xm:sqref"]] = sparkline["x14:sparkline"]["xm:f"]
    return sparklines_dict
    # }}} function load_sparklines #
# Available Chart Properties:
# title: str
# anchor: ["oneCell" | "twoCell" | "absolute", col0, row0, col1, row1]
# legend: "b" | "tr" | "l" | "r" | "t"
# width: number
# height: number
# type: "scatterChart" | "lineChart" | "barChart"
# direction: "bar" (hori) | "col" (vert)
# xtitle, ytitle, ztitle: str
def load_charts(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]:
    # function load_charts {{{ #
    """
    Args:
        xlsx_file (Workbook): concerned excel book
        sheet_name (str): sheet name
        options (Dict[str, List[str]]): dict like {"chart_props": list of str}
          giving the concerned chart properties

    Returns:
        Dict[str, Any]: information of charts, dict like
          {
            <str representing data source>: {
                <str as property>: anything
            }
          }
    """

    # workbook: Workbook = openpyxl.load_workbook(filename=xlsx_file)
    try:
        worksheet: Worksheet = xlsx_file[sheet_name]
    except KeyError:
        return {}
    charts: List[ChartBase] = worksheet._charts

    chart_set: Dict[str, Any] = {}
    chart_props: Set[str] = set(options["chart_props"]) if "chart_props" in options else set()
    for ch in charts:
        # A chart is keyed by its series: "values,categories" pairs joined by ";"
        series: List[str] = []
        for ser in ch.series:
            if hasattr(ser.val, "numRef") and hasattr(ser.val.numRef, "f"):
                value_str: str = ser.val.numRef.f
            elif hasattr(ser.val, "strRef") and hasattr(ser.val.strRef, "f"):
                value_str: str = ser.val.strRef.f
            else:
                value_str: str = ""
            if hasattr(ser.cat, "numRef") and hasattr(ser.cat.numRef, "f"):
                categ_str: str = ser.cat.numRef.f
            elif hasattr(ser.cat, "strRef") and hasattr(ser.cat.strRef, "f"):
                categ_str: str = ser.cat.strRef.f
            else:
                categ_str: str = ""
            series.append("{:},{:}".format(value_str, categ_str))
        series: str = ";".join(series)

        # TODO: maybe more aspects, like chart type
        info: Dict[str, Any] = {}
        if "title" in chart_props:
            try:
                info["title"] = ch.title.tx.rich.p[0].r[0].t
            except Exception:
                info["title"] = None
        if "legend" in chart_props:
            info["legend"] = ch.legend.position if ch.legend is not None else None
        if "anchor" in chart_props:
            # [editAs, col0, row0, col1, row1]; fixed: the top-left row is
            # anchor._from.row (previously anchor.to.row was used here, which
            # duplicated the bottom-right row)
            info["anchor"] = [ch.anchor.editAs,
                              ch.anchor._from.col, ch.anchor._from.row,
                              ch.anchor.to.col, ch.anchor.to.row
                              ]
        if "width" in chart_props:
            info["width"] = ch.width
        if "height" in chart_props:
            info["height"] = ch.height
        if "type" in chart_props:
            info["type"] = ch.tagname
        if "direction" in chart_props:
            # "bar" is horizontal, "col" is vertical
            info["direction"] = ch.barDir
        if "xtitle" in chart_props:
            try:
                info["xtitle"] = ch.x_axis.title.tx.rich.p[0].r[0].t
            except Exception:
                info["xtitle"] = None
        if "ytitle" in chart_props:
            try:
                info["ytitle"] = ch.y_axis.title.tx.rich.p[0].r[0].t
            except Exception:
                info["ytitle"] = None
        if "ztitle" in chart_props:
            try:
                info["ztitle"] = ch.z_axis.title.tx.rich.p[0].r[0].t
            except Exception:
                info["ztitle"] = None
        chart_set[series] = info
    logger.debug(".[%s].charts: %s", sheet_name, repr(chart_set))
    return chart_set
    # }}} function load_charts #
# Available Pivot Properties:
# name: str
# show_total, show_empty_row, show_empty_col, show_headers: bool
# location: str
# selection: if the concrete item selection should be checked, a list of set of tuple like (bool, index) will be returned; list will be returned instead of set if "ordered" is specified
# filter: if the filter fields should be checked; fields indices will be return in `filter_fields` item
# col_fields: indices
# row_fields: indices
# data_fields: list of str representations. the str representation is like "index;name;subtotal_type;show_data_as"; name is optional and is only returned when `data_fields_name` is specified in `pivot_props`
def load_pivot_tables(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]:
    # function load_pivot_tables {{{ #
    """
    Args:
        xlsx_file (Workbook): concerned excel book
        sheet_name (str): sheet name
        options (Dict[str, List[str]]): dict like {"pivot_props": list of str}
          giving the concerned pivot properties

    Returns:
        Dict[str, Any]: information of pivot tables, dict like
          {
            <str representing data source>: {
                <str as property>: anything
            }
          }
    """

    try:
        worksheet: Worksheet = xlsx_file[sheet_name]
    except KeyError:
        return {}
    pivots: List[PivotTableDefinition] = worksheet._pivots

    pivot_set: Dict[str, Any] = {}
    pivot_props: Set[str] = set(options.get("pivot_props", []))
    for pvt in pivots:
        # per-field item selections as (hidden flag, item index) pairs
        raw_selection: List[List[tuple[Optional[bool], int]]] = \
            [[(itm.h, itm.x) for itm in f.items if itm.x is not None]
             for f in pvt.pivotFields
             ]
        # drop leading fields that carry no concrete item selection ...
        raw__selection: List[List[tuple[Optional[bool], int]]] = list(
            itertools.dropwhile(lambda r: len(r) == 0, raw_selection))
        left_bias = len(raw_selection) - len(raw__selection)
        # ... and trailing ones likewise
        selection: List[List[tuple[Optional[bool], int]]] = list(
            (itertools.dropwhile(lambda r: len(r) == 0, reversed(raw__selection))))[::-1]
        right_bias = len(raw__selection) - len(selection)

        # shrink the cached source range by the biases so the reported data
        # source spans only the fields actually used by the pivot
        cache_source: PivotCacheSource = pvt.cache.cacheSource
        cell_range1: str
        cell_range2: str
        cell_range1, cell_range2 = cache_source.worksheetSource.ref.split(":")
        cell_range1: Tuple[int, int] = coordinate_to_tuple(cell_range1)
        cell_range1 = (cell_range1[0], cell_range1[1] + left_bias)
        cell_range2: Tuple[int, int] = coordinate_to_tuple(cell_range2)
        cell_range2 = (cell_range2[0], cell_range2[1] - right_bias)
        source: str = "{:};{:}:{:};{:}".format(cache_source.type, cell_range1, cell_range2,
                                               cache_source.worksheetSource.sheet)

        info: Dict[str, Any] = {}
        if "name" in pivot_props:
            info["name"] = pvt.name
        if "show_total" in pivot_props:
            info["show_total"] = pvt.visualTotals
        if "show_empty_row" in pivot_props:
            info["show_empty_row"] = pvt.showEmptyRow
        if "show_empty_col" in pivot_props:
            info["show_empty_col"] = pvt.showEmptyCol
        if "show_headers" in pivot_props:
            info["show_headers"] = pvt.showHeaders
        if "location" in pivot_props:
            info["location"] = pvt.location
        if "filter" in pivot_props or "selection" in pivot_props:
            # per-field sets, unless an explicit order is requested via "ordered"
            info["selection"] = selection if "ordered" in pivot_props else list(set(r) for r in selection)
        if "filter" in pivot_props:
            info["filter_fields"] = set(f.fld for f in pvt.pageFields)
        if "col_fields" in pivot_props:
            # field indices are rebased onto the shrunk source range
            info["col_fields"] = [f.x - left_bias for f in pvt.colFields]
        if "row_fields" in pivot_props:
            info["row_fields"] = [f.x - left_bias for f in pvt.rowFields]
        if "data_fields" in pivot_props:
            # "index;name;subtotal_type;show_data_as"; name is included only
            # when "data_fields_name" is requested
            info["data_fields"] = [
                "{:d};{:};{:};{:}".format(f.fld - left_bias, f.name if "data_fields_name" in pivot_props else "",
                                          f.subtotal, f.showDataAs
                                          )
                for f in pvt.dataFields
            ]
        pivot_set[source] = info
    logger.debug(".[%s].pivots: %s", sheet_name, repr(pivot_set))
    return pivot_set
    # }}} function load_pivot_tables #
# selects the <si> (string item) elements of xl/sharedStrings.xml
_shared_str_selector = lxml.cssselect.CSSSelector("oo|sst>oo|si", namespaces=_xlsx_ns_mapping)
# selects the <t> (text run) elements within one string item
_shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx_ns_mapping)
def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
    # read_cell_value {{{ #
    """
    Reads the stored value of one cell by parsing the raw xlsx archive
    directly (so cached formula results are visible without recalculation).

    Args:
        xlsx_file (str): path to xlsx file on disk
        sheet_name (str): sheet name
        coordinate (str): cell coordinate such as "A1"

    Returns:
        Any: float for numeric cells; str for shared/inline/formula strings
          and error literals; None when the file, sheet, or cell is missing
          or unreadable
    """

    logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}")

    # Check if file exists
    if not os.path.exists(xlsx_file):
        logger.error(f"Excel file not found: {xlsx_file}")
        return None

    try:
        with zipfile.ZipFile(xlsx_file, "r") as z_f:
            # shared strings are referenced by index from "s"-typed cells
            try:
                with z_f.open("xl/sharedStrings.xml") as f:
                    shared_str_xml: _Element = lxml.etree.fromstring(f.read())
                    str_elements: List[_Element] = _shared_str_selector(shared_str_xml)
                    shared_strs: List[str] = ["".join(t.text for t in _shared_str_value_selector(elm))
                                              for elm in str_elements
                                              ]
            except:
                # a workbook without any shared strings has no such part
                #logger.exception("Read shared strings error: %s", xlsx_file)
                logger.debug("Read shared strings error: %s", xlsx_file)
                shared_strs: List[str] = []
            # resolve sheet name -> sheetId -> worksheet part
            with z_f.open("xl/workbook.xml") as f:
                workbook_database: _Element = lxml.etree.fromstring(f.read())
                sheets: List[_Element] = _sheet_name_selector(workbook_database)
                sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets}
            with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f:
                sheet: _Element = lxml.etree.fromstring(f.read())
                # locate the single <c> element carrying the requested coordinate
                cells: List[_Element] = \
                    lxml.cssselect.CSSSelector('oo|row>oo|c[r="{:}"]'.format(coordinate)
                                               , namespaces=_xlsx_ns_mapping
                                               )(sheet)
                if len(cells) == 0:
                    logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}")
                    return None
                cell: _Element = cells[0]
    except zipfile.BadZipFile as e:
        logger.error(f"Bad zip file {xlsx_file}: {e}")
        return None
    except KeyError as e:
        logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}")
        return None
    except Exception as e:
        logger.error(f"Error reading {xlsx_file}: {e}")
        return None

    cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")
                                           , process_namespaces=True
                                           , namespaces=_xlsx_ns_imapping
                                           )
    logger.debug("%s.shared_strings: %s", xlsx_file, repr(shared_strs))
    logger.debug("%s.%s[%s]: %s", xlsx_file, sheet_name, coordinate, repr(cell))

    # dispatch on the cell type attribute "@t":
    #   absent or "n" -> number; "s" -> shared string index; "str" -> cached
    #   formula string; "inlineStr" -> inline string; "e" -> error literal
    try:
        if "@t" not in cell["c"] or cell["c"]["@t"] == "n":
            return float(cell["c"]["v"])
        if cell["c"]["@t"] == "s":
            return shared_strs[int(cell["c"]["v"])]
        if cell["c"]["@t"] == "str":
            return cell["c"]["v"]
        if cell["c"]["@t"] == "inlineStr":
            return cell["c"]["is"]["t"]
        if cell["c"]["@t"] == "e":
            return cell["c"]["v"]
    except (KeyError, ValueError):
        # empty <c> element, missing <v>, or a non-numeric "n" cell
        return None
    # }}} read_cell_value #
# Supported Styles:
# number_format
# font_name - str
# font_family - float
# font_color - in aRGB, e.g., FF000000 is black
# font_bold - bool
# font_italic - bool
# font_underline - "single" | "double" | "singleAccounting" | "doubleAccounting"
# font_size - float
# fill_type - "patternFill" | "gradientFill"
# bgcolor - in aRGB, e.g., FFFF0000 is red; This property seems to be ambiguous with fgcolor in xlsx, strange
# fgcolor - in aRGB, e.g., FF00FFFF is yellow # Deprecated
# hyperlink - str
# merge - bool, if the cell is in a merged range and is not the first cell in the merged range
def _read_cell_style(style_name: str, cell: Union[Cell, MergedCell], diff_style: Optional[DifferentialStyle] = None) -> Any:
    """
    Reads one style property of a cell.  When *diff_style* is supplied (from a
    conditional-formatting rule), the differential style takes precedence over
    the cell's own style.

    Args:
        style_name (str): one of the supported style keys listed above
        cell (Union[Cell, MergedCell]): concerned cell
        diff_style (Optional[DifferentialStyle]): overriding differential style

    Returns:
        Any: the property value; None for empty cells or unreadable styles

    Raises:
        NotImplementedError: on an unsupported style name
    """

    styled = diff_style or cell
    has_value = cell.value is not None

    if style_name == "number_format":
        # only meaningful for non-empty numeric cells
        if not has_value or cell.data_type != "n":
            return None
        return cell.number_format if diff_style is None else diff_style.numFmt.formatCode

    font_attributes = {"font_name": "name", "font_family": "family",
                       "font_color": "color", "font_bold": "bold",
                       "font_italic": "italic", "font_underline": "underline",
                       "font_size": "size"}
    if style_name in font_attributes:
        if not has_value:
            return None
        attribute = getattr(styled.font, font_attributes[style_name])
        # colors are reported as aRGB strings
        return attribute.rgb if style_name == "font_color" else attribute

    if style_name == "fill_type":
        try:
            return styled.fill.tagname
        except:
            return None
    if style_name in ("bgcolor", "fgcolor"):
        # xlsx is ambiguous here: differential styles store the color as
        # bgColor while plain cell fills expose it as fgColor
        try:
            return diff_style.fill.bgColor.rgb if diff_style is not None else cell.fill.fgColor.rgb
        except:
            return None
    if style_name == "hyperlink":
        return (cell.hyperlink or "") if has_value else None
    if style_name == "merge":
        # True iff the cell is a follower cell of a merged range
        return isinstance(cell, MergedCell)
    raise NotImplementedError("Unsupported Style: {:}".format(style_name))
def _process_xlsx_cf_operator(operator: str, value: Any, ref: List[Any]) -> bool:
# function _process_xlsx_cf_operator {{{ #
# "containsText", "lessThanOrEqual", "notBetween", "lessThan", "notContains", "beginsWith", "equal", "greaterThanOrEqual", "between", "endsWith", "notEqual", "greaterThan"
try:
if operator=="lessThanOrEqual":
result: bool = value<=ref[0]
elif operator=="lessThan":
result: bool = value<ref[0]
elif operator=="equal":
result: bool = value==ref[0]
elif operator=="greaterThanOrEqual":
result: bool = value>=ref[0]
elif operator=="notEqual":
result: bool = value!=ref[0]
elif operator=="greaterThan":
result: bool = value>ref[0]
elif operator=="between":
small_one: float
large_one: float
small_one, large_one = min(ref), max(ref)
result: bool = value>=small_one and value<=large_one
elif operator=="notBetween":
small_one: float
large_one: float
small_one, large_one = min(ref), max(ref)
result: bool = value<small_one or value>large_one
else:
#raise NotImplementedError("Not Implemented CondFormat Operator: {:}".format(operator))
logger.exception("Not Implemented CondFormat Operator: {:}".format(operator))
return result
except TypeError:
logger.exception("Unmatched type of %s and %s. Auto to False", repr(value), repr(ref))
return False
except IndexError:
logger.exception("ref array doesn't have enough elements. Auto to False: %s", repr(ref))
return False
# }}} function _process_xlsx_cf_operator #
# matches absolute references like "$A$1" or absolute ranges like "$A$1:$B$2"
# inside a conditional-formatting formula; the second coordinate is optional
_absolute_range_pattern: Pattern[str] = re.compile(r"""\$(?P<col1>[A-Z]{1,3})\$(?P<row1>\d+) # coord1
(?::
\$(?P<col2>[A-Z]{1,3})\$(?P<row2>\d+) # coord2
)?
"""
                                                   , re.X
                                                   )
def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **options) -> Dict[str, List[Any]]:
    # function load_xlsx_styles {{{ #
    """
    Args:
        xlsx_file (Workbook): concerned excel book
        sheet_name (str): sheet name
        book_name (str): path to the xlsx file on disk (used to read the raw
          cell values needed to evaluate conditional-formatting rules)
        options (Dict[str, List[str]]): dict like {"props": list of str} giving
          the concerned styles

    Returns:
        Dict[str, List[Any]]: dict like
          {
            <str as cell coordinates>: list of anything indicating concerned
              property values
          }
    """

    try:
        worksheet: Worksheet = xlsx_file[sheet_name]
    except KeyError:
        return {}

    style_dict: Dict[str, List[Any]] = {}
    concerned_styles: List[str] = options.get("props", [])

    # Handles Cell Styles
    for col in worksheet.iter_cols():
        for c in col:
            style_dict[c.coordinate] = [_read_cell_style(st, c) for st in concerned_styles]

    # Handles Conditional Formatting
    conditional_formattings: ConditionalFormattingList = worksheet.conditional_formatting
    formula_parser = formulas.Parser()
    for fmt in conditional_formattings:
        for r in fmt.rules:
            active_cells: List[Cell] = []

            # Process CF Formulae {{{ #
            formulae: List[Callable[[Any], Any]] = []
            argument_lists: List[List[Any]] = []
            has_error = False
            for fml in r.formula:
                try:
                    formula_func: Callable[[Any], Any] = \
                        formula_parser.ast("=" + fml)[1].compile()
                    logger.debug("CondFormat rule formula: %s", fml)
                except:
                    logger.exception("Formula parsing error: %s. Skipping.", repr(fml))
                    has_error = True
                    break

                # absolute references are resolved to raw cell values and
                # passed to the compiled formula as extra arguments
                arguments: List[Any] = []
                absolute_range_match: List[Tuple[str, str, str, str]] = _absolute_range_pattern.findall(fml)
                for m in absolute_range_match:
                    logger.debug("Absolute ranges: %s", repr(m))
                    # re.findall yields "" (never None) for non-participating
                    # groups, so emptiness is the single-cell test; the former
                    # `is None` check never held and broke $A$1-style refs
                    if not m[2] and not m[3]:
                        arguments.append(read_cell_value(book_name, sheet_name, coordinate="{:}{:}".format(m[0], m[1])))
                    else:
                        arguments.append([read_cell_value(book_name, sheet_name,
                                                          coordinate="{:}{:}".format(get_column_letter(c[1]), c[0]))
                                          for c in CellRange("{:}{:}:{:}{:}".format(m[0], m[1], m[2], m[3])).cells
                                          ])
                logger.debug("Absolute range arguments: %s", repr(arguments))
                formulae.append(formula_func)
                argument_lists.append(arguments)
            if has_error:
                continue
            # }}} Process CF Formulae #

            # Process Condition According to Type {{{ #
            if r.type in {"expression",
                          "containsText", "notContainsText",
                          "endsWith", "beginsWith",
                          "containsErrors", "notContainsErrors"}:
                condition: Callable[[Any], bool] = formulae[0]
                arguments: List[Any] = argument_lists[0]
                is_active: Callable[[Any], bool] = lambda v: condition(v, *arguments)
            elif r.type == "cellIs":
                # local renamed from `operator` to avoid shadowing the module
                cf_operator: str = r.operator
                try:
                    references: List[Any] = [fml() for fml in formulae]
                except:
                    logger.exception("Error occurs while calculating reference values for cellIs condition formatting.")
                    continue
                is_active: Callable[[Any], bool] = \
                    lambda v: _process_xlsx_cf_operator(cf_operator, v, references)
            else:
                # e.g., type=top10 (rank=number, percent=bool, bottom=bool)
                # type=aboveAverage (equalAverage=bool, aboveAverage=bool)
                # type=duplicateValues / type=uniqueValues
                logger.error("Not Implemented Condition Type: {:}".format(r.type))
                # skip this rule; previously execution fell through and an
                # `is_active` left over from a *previous* rule could be applied
                continue
            # }}} Process Condition According to Type #

            # Test Each Cell {{{ #
            nb_contiguous_nothings = 0
            for rge in fmt.cells:
                for c in rge.cells:
                    cell: Cell = worksheet.cell(row=c[0], column=c[1])
                    cell_value = read_cell_value(book_name, sheet_name,
                                                 coordinate="{:}{:d}".format(get_column_letter(c[1]), c[0]))
                    if cell_value is None:
                        # NOTE: the counter is never reset on a hit, so this
                        # aborts the current range after 50 empty cells total
                        nb_contiguous_nothings += 1
                        if nb_contiguous_nothings > 50:
                            break
                        continue
                    try:
                        satisfies_condition: bool = is_active(cell_value)
                    except:
                        # "%s" (was "%d"): cell_value may well be a string
                        logger.exception("Error in formula calculation with cell value %s", repr(cell_value))
                        satisfies_condition = False
                    if satisfies_condition:
                        logger.debug("Active Cell %s(%s) for %s", repr(cell), repr(cell_value), r.formula[0])
                        active_cells.append(cell)
            # }}} Test Each Cell #

            # active cells take their styles from the rule's differential style
            for c in active_cells:
                style_dict[c.coordinate] = [_read_cell_style(st, c, r.dxf) for st in concerned_styles]

    logger.debug(".[%s].styles: %s", sheet_name, repr(style_dict))
    return style_dict
    # }}} function load_xlsx_styles #
# Available Row Properties:
# hidden
# collapsed
# height
#
# Available Column Properties:
# width
# auto_size
# hidden
# collapsed
# min
# max
def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options) \
        -> Dict[Union[int, str], Dict[str, Any]]:
    # function load_rows_or_cols {{{ #
    """
    Collects dimension properties of the rows or columns of a sheet.

    Args:
        xlsx_file (Workbook): concerned excel book
        sheet_name (str): sheet name
        options (Dict[str, List[str]]): dict like
          {"obj": "row" | "column", "props": list of str} giving the concerned
          row/column properties

    Returns:
        Dict[Union[int, str], Dict[str, Any]]: row/column information keyed by
          row number or column letter
    """

    try:
        sheet: Worksheet = xlsx_file[sheet_name]
    except KeyError:
        return {}

    # e.g. worksheet.row_dimensions or worksheet.column_dimensions
    dimensions: DimensionHolder = getattr(sheet, "{:}_dimensions".format(options["obj"]))
    requested: Set[str] = set(options.get("props", []))
    result: Dict[int, Any] = {
        key: {prop: getattr(dms, prop) for prop in requested}
        for key, dms in dimensions.items()
    }
    return result
    # }}} function load_rows_or_cols #
def load_filters(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]:
    # function load_filters {{{ #
    """
    Reads the auto-filter definition of a sheet: the filtered range, the
    per-column filter settings (sorted by column id), and the sort state.

    Args:
        xlsx_file (Workbook): concerned excel book
        sheet_name (str): sheet name
        options (Dict[str, Any]): unused, kept for interface uniformity

    Returns:
        Dict[str, Any]: filter information with keys "ref", "filter_column"
          and optionally "sort_state"
    """

    try:
        sheet: Worksheet = xlsx_file[sheet_name]
    except KeyError:
        return {}

    auto_filter: AutoFilter = sheet.auto_filter
    filter_dict: Dict[str, Any] = {"ref": auto_filter.ref}

    # filterColumn
    columns: List[Dict[str, Any]] = []
    for flt_clm in auto_filter.filterColumn:
        entry: Dict[str, Any] = {
            "col_id": flt_clm.colId,
            "hidden_button": flt_clm.hiddenButton,
            "show_button": flt_clm.showButton,
        }
        if flt_clm.filters is not None:
            entry["filters_blank"] = flt_clm.filters.blank
            entry["filters"] = set(flt_clm.filters.filter)
        if flt_clm.customFilters is not None:
            entry["custom_filters_op"] = flt_clm.customFilters._and
            entry["custom_filters"] = {(flt.operator, flt.val)
                                       for flt in flt_clm.customFilters.customFilter}
        columns.append(entry)
    columns.sort(key=operator.itemgetter("col_id"))
    filter_dict["filter_column"] = columns

    # sortState
    sort_state: Optional[SortState] = auto_filter.sortState
    if sort_state is not None:
        filter_dict["sort_state"] = {
            "sort": sort_state.columnSort,
            "case": sort_state.caseSensitive,
            "method": sort_state.sortMethod,
            "ref": sort_state.ref,
            "condition": [{"descending": cdt.descending,
                           "key": cdt.sortBy,
                           "ref": cdt.ref,
                           "custom_list": cdt.customList,
                           "dxf_id": cdt.dxfId,
                           "icon": cdt.iconSet,
                           "iconid": cdt.iconId,
                           }
                          for cdt in sort_state.sortCondition
                          ],
        }
    return filter_dict
    # }}} function load_filters #
def _match_record(pattern: Dict[str, Any], item: Dict[str, Any]) -> bool:
return all(k in item and item[k] == val for k, val in pattern.items())
def _multicellrange_containsby(subset_candidate: MultiCellRange, superset_candidate: MultiCellRange) -> bool:
    """Check that every cell range of *subset_candidate* lies within *superset_candidate*."""
    for rng in subset_candidate:
        if rng not in superset_candidate:
            return False
    return True
def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool:
"""
Args:
value (V): value to match
rule (Dict[str, Union[str, V]]): rule dict like
{
"method": str
"ref": V as ref value
}
Returns:
bool
"""
if rule["method"].startswith("re"): # re.FLAGs
flags: List[str] = rule["method"].split(".")[1:]
flags: Iterable[re.RegexFlag] = (getattr(re, fl) for fl in flags)
flag: re.RegexFlag = functools.reduce(operator.or_, flags, re.RegexFlag(0))
logger.debug("REFLAG: %s", repr(flag))
match_: Optional[Match[str]] = re.search(rule["ref"], value, flag)
return match_ is not None
if rule["method"] in {"eq", "ne"
, "le", "lt"
, "ge", "gt"
}:
return getattr(operator, rule["method"])(value, rule["ref"])
if rule["method"].startswith("approx"): # approx:THRESHOLD
threshold: float = float(rule["method"].split(":")[1])
logger.debug("Approx: TH%f, REF%f, VAL%s", threshold, rule["ref"], repr(value))
try:
value = float(value)
except (ValueError, TypeError):
return False
else:
return abs(value - rule["ref"]) <= threshold
if rule["method"] == "spreadsheet_range":
subset_limit = MultiCellRange(rule["ref"][0])
superset_limit = MultiCellRange(rule["ref"][1])
return _multicellrange_containsby(subset_limit, value) \
and _multicellrange_containsby(value, superset_limit)
if rule["method"].startswith("range."): # e.g., range.te [0, 2] -> 0 < x <= 2
left_et = rule["method"][6]
right_et = rule["method"][7]
return getattr(operator, "l" + left_et)(rule["ref"][0], value) \
and getattr(operator, "l" + right_et)(value, rule["ref"][1])
if rule["method"] in {"str_list_eq", "str_set_eq"}:
container_type_str: str = rule["method"][4:-3]
container_type = getattr(builtins, container_type_str)
value: container_type = container_type(value.strip("\"'").split(","))
ref: container_type = container_type(rule["ref"])
return value == ref
raise NotImplementedError()
def are_lists_equal(list1, list2, comparison_func):
    """
    Compares two lists for equality under a custom comparison function,
    ignoring element order.

    NOTE: each element of list1 only needs *some* match in list2 (lengths must
    agree, but duplicate multiplicities are not cross-checked).
    """
    # Lists of different lengths can never be equal
    if len(list1) != len(list2):
        return False
    # Every element of the first list must have a counterpart in the second
    return all(
        any(comparison_func(lhs, rhs) for rhs in list2)
        for lhs in list1
    )
def compare_urls(url1, url2, full=True):
    """
    Compares two URLs for equality after normalization.

    Normalization: a missing scheme defaults to "http"; a sole "www"
    subdomain is dropped; the public suffix (e.g. ".com", ".com.sg") is
    ignored so regional variants of the same site compare equal; a bare "/"
    path is treated as empty.

    Args:
        url1 (Optional[str]): first URL
        url2 (Optional[str]): second URL
        full (bool): if False, params, query and fragment are ignored

    Returns:
        bool: whether the two URLs are considered the same
    """

    if url1 is None or url2 is None:
        return url1 == url2
    logger.info(f"compare_urls. url1: {url1}; url2: {url2}")

    def parse_with_default_scheme(url):
        """
        Ensure the URL has a scheme. If not, prepend 'http://'
        so it parses as host + path instead of just a path.
        """
        # Regex to check if URL has scheme like 'http://', 'https://', etc.
        if not re.match(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://', url):
            url = f"http://{url}"
        return urlparse(url)

    def normalize_url(url):
        # Parse the URL; if no scheme is present, assume 'http'
        parsed_url = parse_with_default_scheme(url)
        scheme = parsed_url.scheme.lower()

        # Extract the domain parts using tldextract
        extracted = tldextract.extract(parsed_url.netloc.lower())
        # e.g., extracted = TLDExtractResult(subdomain='www', domain='airbnb', suffix='com.sg')
        # Drop 'www' if it's the only subdomain
        subdomain = extracted.subdomain
        if subdomain == 'www':
            subdomain = ''

        # Ignore the suffix completely so that both 'airbnb.com' and
        # 'airbnb.com.sg' become just 'airbnb' (or 'sub.airbnb')
        if subdomain:
            normalized_netloc = f"{subdomain}.{extracted.domain}"
        else:
            normalized_netloc = extracted.domain

        # Handle trailing slash in the path
        normalized_path = parsed_url.path if parsed_url.path != '/' else ''

        # Reassemble the URL with the normalized components
        normalized_parsed_url = ParseResult(
            scheme=scheme,
            netloc=normalized_netloc,
            path=normalized_path,
            params=parsed_url.params if full else '',    # Keep the params
            query=parsed_url.query if full else '',      # Keep the query string
            fragment=parsed_url.fragment if full else '', # Keep the fragment
        )
        return urlunparse(normalized_parsed_url)

    # Normalize each URL exactly once (previously each URL was normalized
    # twice: once for the log line and again for the comparison)
    norm_url1 = normalize_url(url1)
    norm_url2 = normalize_url(url2)
    logger.info(f"After normalization. url1: {norm_url1}; url2: {norm_url2}")

    # Compare the normalized URLs
    return norm_url1 == norm_url2