use std::str::FromStr;
use tl::HTMLTag;
use tracing::instrument;
use url::Url;
use pep440_rs::VersionSpecifiers;
use pypi_types::LenientVersionSpecifiers;
use pypi_types::{BaseUrl, DistInfoMetadata, File, Hashes, Yanked};
/// A parsed structure from PyPI "HTML" index format for a single package.
#[derive(Debug, Clone)]
pub(crate) struct SimpleHtml {
/// The [`BaseUrl`] to which all relative URLs should be resolved.
pub(crate) base: BaseUrl,
/// The list of [`File`]s available for download sorted by filename.
pub(crate) files: Vec,
}
impl SimpleHtml {
/// Parse the list of [`File`]s from the simple HTML page returned by the given URL.
///
/// `text` is the HTML document to parse. `url` is used as the base URL when no
/// `<base>` tag precedes the first `<a>` or `<link>` tag, or when `parse_base`
/// yields no URL for that tag.
///
/// The returned files are sorted by filename.
///
/// # Errors
///
/// Returns an error if the HTML cannot be parsed, or if `parse_base` or
/// `parse_anchor` fails for one of the tags.
#[instrument(skip_all, fields(url = % url))]
pub(crate) fn parse(text: &str, url: &Url) -> Result {
// NOTE(review): the return type above and the `collect` turbofish below look like
// they have lost their generic arguments (presumably `Result<Vec<_>, _>` for the
// latter) — confirm against the upstream file.
let dom = tl::parse(text, tl::ParserOptions::default())?;
// Parse the first `<base>` tag, if any, to determine the base URL to which all
// relative URLs should be resolved. The HTML spec requires that the `<base>` tag
// appear before other tags with attribute values of URLs, so the search stops at
// the first `<a>` or `<link>` tag.
let base = BaseUrl::from(
dom.nodes()
.iter()
.filter_map(|node| node.as_tag())
.take_while(|tag| !matches!(tag.name().as_bytes(), b"a" | b"link"))
.find(|tag| tag.name().as_bytes() == b"base")
.map(|base| Self::parse_base(base))
.transpose()?
.flatten()
// Fall back to the URL of the page itself.
.unwrap_or_else(|| url.clone()),
);
// Parse each `<a>` tag, to extract the filename, hash, and URL. The first
// anchor that fails to parse aborts the whole parse with its error.
let mut files: Vec = dom
.nodes()
.iter()
.filter_map(|node| node.as_tag())
.filter(|link| link.name().as_bytes() == b"a")
.map(|link| Self::parse_anchor(link))
.collect::, _>>()?;
// While an unstable ordering has not been positively observed, we sort the
// files to ensure we have a defined ordering. Otherwise, if we rely on the
// API to provide a stable ordering and it doesn't, it can lead to
// non-deterministic behavior elsewhere. (This is somewhat hand-wavy
// and a bit of a band-aid, since arguably, the order of this API
// response probably shouldn't have an impact on things downstream from
// this. That is, if something depends on ordering, then it should
// probably be the thing that does the sorting.)
//
// The sort is unstable, so entries with identical filenames may end up in
// any relative order.
files.sort_unstable_by(|f1, f2| f1.filename.cmp(&f2.filename));
Ok(Self { base, files })
}
/// Parse the `href` from a `` tag.
fn parse_base(base: &HTMLTag) -> Result