Post-Local sync at 2025-06-23T22:46:07Z
This commit is contained in:
parent
9d33b42020
commit
9f97801b0d
1387 changed files with 250216 additions and 117 deletions
|
@ -1,16 +1,16 @@
|
|||
-----BEGIN PGP SIGNATURE-----
|
||||
|
||||
iQIzBAABCgAdFiEEif0F7T4AkoRgIfM3TifTfDWIcr8FAmhZrGAACgkQTifTfDWI
|
||||
cr+ySQ/9EOf9kNg59Axzic0fs/1wij1AsEHGJjR6qfqJPsjtFylNHMtleewh+qor
|
||||
LuFJlpJFLPJnZSsX+wJEc9LSuLYm2gExHAq013NpPrB8HKx5P0GmmSXIDSbtuQgn
|
||||
JMLb9PjS735/aMudtc2c+f/QS5gPv/djmX0vqp13JM9gkulIxOpzhpZVzs5p5/Ng
|
||||
atAznp1bCLtlB8iGuo9zlk6qqUni/2f5VrJZrL7NgrheV11LUDMe/GK0YksPY98c
|
||||
RHzOQjHz8z5upzNuKHZH1bcssHFFYUcEMrvo0zONa+/B4l5w8PQWMMXc1iWnxhih
|
||||
1Csg0HGI94qGrRgMAtEmG7J/NdCLcW6+PnX5d2CwHqL1kpEub8dbRmEgKo5sAp5X
|
||||
qo/PXgQur7BJ8RT7PTJMprN6Hty21j9b5+hzoiguDbXtbZEDaCygGAMRuO0nQD/a
|
||||
QlfR0oWONJLn4cYHqGZA4E7KPYzRAnFD/qYTL4Es7/C3cImcDwML1nF35hYbUH39
|
||||
q4kDaZfHlN+mItC6mcRT7r6Eyh2JSgAG61tbBN5Ggk6oCdYEUPdrGKH/49qOlB5h
|
||||
mxfmghnBITlAo4YmQOsogYdt1K8Sfs+3AE5utGC9DAph95gLPBcZq/t4oTwG2GnQ
|
||||
dcRzVAo4y7A50VHwrdoea1TxCrBWKl8wsVgM+y48u+FJNmDUUpc=
|
||||
=JOun
|
||||
iQIzBAABCgAdFiEEif0F7T4AkoRgIfM3TifTfDWIcr8FAmhZ2UsACgkQTifTfDWI
|
||||
cr92pA//XFNPmpPzT7okOfJo+WmNVLEX2kUz350UEw9b0+55YWsmySspjyFlKboy
|
||||
ph4MN9goYxdcrzXHNrPlo5Vo9nQRfg1dhDDYgi2ofnjXiVsiHFKY2D0JeUHGZZGa
|
||||
XJxxFeBXHuBFQlbhwCymwGGNIZUvEgaOfj3f7Qq7KdRSklL9u+VXboS94Q8vwt6X
|
||||
mpeQ88jcNfVjOHgniUve8P1rdbVUVCHzv+p03sZhZFknvLxj6gHUjykniERSqKGF
|
||||
lNywbaoS+jgGd9XJqnNb0j/kmfmV3Jne6p8CecBBDpkZTuPY1a8RvG9N+WjYmDrP
|
||||
GwmUHdG32LRlu9lLhipbonfq/NjaNTNqLJn3E4+Kg8ythOpxglAvdgEuTLZkkjjS
|
||||
YlminSueBSKzOS5ZfmceSnpIWq7R/74iQ6KvxbH6I2dutLcPtrv9THKeBhftkwRI
|
||||
awf/snxoYYn5nqAvHa4cpH1jGLOWn8QEKulH99Z26eYm8fd33msyw5L2ttp3y+rO
|
||||
QgY9frnc1qcaT9Nfjzy2noQtQrvu3KYZjudVCXOZeJw06Ytfc9ykFiHkDR6PUQxk
|
||||
tH83nIY8LO0fmrpJU+AUVo2Va1c/HUbJ8yf5/ptBZf2JcbqA56/XzZO2MAi5BLTF
|
||||
Xscsyga7h2fBxcgGfTtN6lCOGm3690JkgDZUT9fpsW1EbGqj9Uk=
|
||||
=Nc8Z
|
||||
-----END PGP SIGNATURE-----
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
-----BEGIN PGP SIGNATURE-----
|
||||
|
||||
iQIzBAABCgAdFiEEif0F7T4AkoRgIfM3TifTfDWIcr8FAmhZrJYACgkQTifTfDWI
|
||||
cr+T1g/9FQMzY9DGhFoesOhawB+HYxdMlI7Q4aNLmT3Vml4eiNkQoy2ODkVSsiSv
|
||||
mDhpIzsI7haTuU7vS6hwsdK7+w7O23S6N3qpllbBNmPdl5AEEt8t8ZPSmB9H9Fj0
|
||||
levIms+SmN7TQW8fiHH3qwnjnIPD/tB2KYbceSIonaGd85TPu4bYLurxGTI61LUn
|
||||
D69kFugTpwwMjYwkAkSJg4AnJwkNx6gY3mu58/FeVMsmEzNusmIi3Maymvn2D0pv
|
||||
fwYr7ZXMQkhzBoi2zg+hQxmX3ugCe1dth8wruPJBY4hcPqaTJ2Si9iIBuVr26PgY
|
||||
wnTEDFSS2eQS12WKNnT3jyBfG/xAhqVXGbRmrah3UcPFBDJWyf51Gj9fVGflKbcp
|
||||
RKdYY0g6mby4EYzcvrdLqcGj4b5/88kq02S/MD8vIzXzWvTg3QFVSmYirJ5T2BiV
|
||||
PYUmcCKQOZycy7oU7BpUF9Cal46Veh1CrbkbvUcHs42PClSEcMI7DvtjmSooaM98
|
||||
EKdJ+GhwdnWLfdYBI6PYeij9A5kOBgh/A6awXuHb11MWtFgEVy7oNULPV7sYTipF
|
||||
bE3eu5HWBvvKvzzvglQWmRaegr9SUzDOrELusVRfpfUMFa8qpcLVf3Xkq+O/PMf5
|
||||
x7Ai+FxwDYbj93wcLwOUSjVaNUDuP7QXN/e9hW8l6pvs6o71D5I=
|
||||
=65sO
|
||||
iQIzBAABCgAdFiEEif0F7T4AkoRgIfM3TifTfDWIcr8FAmhZ2WgACgkQTifTfDWI
|
||||
cr9Cvw/7BY/7zSAk/kot+kH5VfJtSbSpI7W7X1+CIrcP+uBuf3ohq1lgWXFGMu3F
|
||||
Rq44Y1MMsHwXQpbhjXFsr2oWsiLFhnChOpxu2Vju8IJ/lU7ryS82U9e2Sfa5N496
|
||||
8ab/bC7wPHl/wWaoTajEKZ2luYJLpIrpIKBtHw4jZ/mopnvx63PBr2nHXIF6673U
|
||||
YWn0cok+pYMDswbZosfsSrxFJi/7YleNGX7fLGC2alh76xKKN0TBDJf9A2/wlRFb
|
||||
P8+G6P8aFj0Tz1Efh01Wh7gsOAZWKO9EilSv/HdzE0fSpUdn87AgMeywExt2Iiex
|
||||
l5OXsHDranwfD/3KNbR+ZpMJY1JQ4WbisO/ayWJHP5o1IWCesw0HpOTieI+b2TsE
|
||||
h4/TDrRf/4M+lX41yHrpzrj21/sk/B+keU9inP0NO4gOCpxOs6yLWc7QnvZCsrsY
|
||||
wxrxrZSGMDdrWkSQcmTDJZ2yHHaGHSw6+LXrLejjsGWQgpMNsvVP4Dw6EE9v8A+8
|
||||
spTmVG2UwQI8twnhP0s3UcnReOI4bl+yd+40yatDr+1VrsEwMhEtdAJNFf9MO9AA
|
||||
wISr9yIu89BqhZuDkDsVSjvBQuykHVA5Cw6u7GZFFTsV3T3Yv6YCzEYM+uCYMypd
|
||||
zbu2cQQVbP6lbNPdFQ1raNiXubExLdePZmesmhMUvQaSPSrUOXo=
|
||||
=2TnB
|
||||
-----END PGP SIGNATURE-----
|
||||
|
|
|
@ -91,3 +91,7 @@
|
|||
Diff Summary:
|
||||
.gitfield/github.sigil.md | 14 +++++++-------
|
||||
1 file changed, 7 insertions(+), 7 deletions(-)
|
||||
[2025-06-23T22:50:19Z] Local: , Branch=master, Commit=8e56dd4
|
||||
Diff Summary:
|
||||
.gitfield/local.sigil.md | 14 +++++++-------
|
||||
1 file changed, 7 insertions(+), 7 deletions(-)
|
||||
|
|
247
.venv/bin/Activate.ps1
Normal file
247
.venv/bin/Activate.ps1
Normal file
|
@ -0,0 +1,247 @@
|
|||
<#
|
||||
.Synopsis
|
||||
Activate a Python virtual environment for the current PowerShell session.
|
||||
|
||||
.Description
|
||||
Pushes the python executable for a virtual environment to the front of the
|
||||
$Env:PATH environment variable and sets the prompt to signify that you are
|
||||
in a Python virtual environment. Makes use of the command line switches as
|
||||
well as the `pyvenv.cfg` file values present in the virtual environment.
|
||||
|
||||
.Parameter VenvDir
|
||||
Path to the directory that contains the virtual environment to activate. The
|
||||
default value for this is the parent of the directory that the Activate.ps1
|
||||
script is located within.
|
||||
|
||||
.Parameter Prompt
|
||||
The prompt prefix to display when this virtual environment is activated. By
|
||||
default, this prompt is the name of the virtual environment folder (VenvDir)
|
||||
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
|
||||
|
||||
.Example
|
||||
Activate.ps1
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -Verbose
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||
and shows extra information about the activation as it executes.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
|
||||
Activates the Python virtual environment located in the specified location.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -Prompt "MyPython"
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||
and prefixes the current prompt with the specified string (surrounded in
|
||||
parentheses) while the virtual environment is active.
|
||||
|
||||
.Notes
|
||||
On Windows, it may be required to enable this Activate.ps1 script by setting the
|
||||
execution policy for the user. You can do this by issuing the following PowerShell
|
||||
command:
|
||||
|
||||
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
||||
|
||||
For more information on Execution Policies:
|
||||
https://go.microsoft.com/fwlink/?LinkID=135170
|
||||
|
||||
#>
|
||||
Param(
|
||||
[Parameter(Mandatory = $false)]
|
||||
[String]
|
||||
$VenvDir,
|
||||
[Parameter(Mandatory = $false)]
|
||||
[String]
|
||||
$Prompt
|
||||
)
|
||||
|
||||
<# Function declarations --------------------------------------------------- #>
|
||||
|
||||
<#
|
||||
.Synopsis
|
||||
Remove all shell session elements added by the Activate script, including the
|
||||
addition of the virtual environment's Python executable from the beginning of
|
||||
the PATH variable.
|
||||
|
||||
.Parameter NonDestructive
|
||||
If present, do not remove this function from the global namespace for the
|
||||
session.
|
||||
|
||||
#>
|
||||
function global:deactivate ([switch]$NonDestructive) {
|
||||
# Revert to original values
|
||||
|
||||
# The prior prompt:
|
||||
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
|
||||
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
|
||||
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
|
||||
}
|
||||
|
||||
# The prior PYTHONHOME:
|
||||
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
|
||||
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
|
||||
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
|
||||
}
|
||||
|
||||
# The prior PATH:
|
||||
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
|
||||
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
|
||||
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
|
||||
}
|
||||
|
||||
# Just remove the VIRTUAL_ENV altogether:
|
||||
if (Test-Path -Path Env:VIRTUAL_ENV) {
|
||||
Remove-Item -Path env:VIRTUAL_ENV
|
||||
}
|
||||
|
||||
# Just remove VIRTUAL_ENV_PROMPT altogether.
|
||||
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
|
||||
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
|
||||
}
|
||||
|
||||
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
|
||||
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
|
||||
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
|
||||
}
|
||||
|
||||
# Leave deactivate function in the global namespace if requested:
|
||||
if (-not $NonDestructive) {
|
||||
Remove-Item -Path function:deactivate
|
||||
}
|
||||
}
|
||||
|
||||
<#
|
||||
.Description
|
||||
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
|
||||
given folder, and returns them in a map.
|
||||
|
||||
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
|
||||
two strings separated by `=` (with any amount of whitespace surrounding the =)
|
||||
then it is considered a `key = value` line. The left hand string is the key,
|
||||
the right hand is the value.
|
||||
|
||||
If the value starts with a `'` or a `"` then the first and last character is
|
||||
stripped from the value before being captured.
|
||||
|
||||
.Parameter ConfigDir
|
||||
Path to the directory that contains the `pyvenv.cfg` file.
|
||||
#>
|
||||
function Get-PyVenvConfig(
|
||||
[String]
|
||||
$ConfigDir
|
||||
) {
|
||||
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
|
||||
|
||||
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
|
||||
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
|
||||
|
||||
# An empty map will be returned if no config file is found.
|
||||
$pyvenvConfig = @{ }
|
||||
|
||||
if ($pyvenvConfigPath) {
|
||||
|
||||
Write-Verbose "File exists, parse `key = value` lines"
|
||||
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
|
||||
|
||||
$pyvenvConfigContent | ForEach-Object {
|
||||
$keyval = $PSItem -split "\s*=\s*", 2
|
||||
if ($keyval[0] -and $keyval[1]) {
|
||||
$val = $keyval[1]
|
||||
|
||||
# Remove extraneous quotations around a string value.
|
||||
if ("'""".Contains($val.Substring(0, 1))) {
|
||||
$val = $val.Substring(1, $val.Length - 2)
|
||||
}
|
||||
|
||||
$pyvenvConfig[$keyval[0]] = $val
|
||||
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
|
||||
}
|
||||
}
|
||||
}
|
||||
return $pyvenvConfig
|
||||
}
|
||||
|
||||
|
||||
<# Begin Activate script --------------------------------------------------- #>
|
||||
|
||||
# Determine the containing directory of this script
|
||||
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
$VenvExecDir = Get-Item -Path $VenvExecPath
|
||||
|
||||
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
|
||||
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
|
||||
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
|
||||
|
||||
# Set values required in priority: CmdLine, ConfigFile, Default
|
||||
# First, get the location of the virtual environment, it might not be
|
||||
# VenvExecDir if specified on the command line.
|
||||
if ($VenvDir) {
|
||||
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
|
||||
}
|
||||
else {
|
||||
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
|
||||
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
|
||||
Write-Verbose "VenvDir=$VenvDir"
|
||||
}
|
||||
|
||||
# Next, read the `pyvenv.cfg` file to determine any required value such
|
||||
# as `prompt`.
|
||||
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
|
||||
|
||||
# Next, set the prompt from the command line, or the config file, or
|
||||
# just use the name of the virtual environment folder.
|
||||
if ($Prompt) {
|
||||
Write-Verbose "Prompt specified as argument, using '$Prompt'"
|
||||
}
|
||||
else {
|
||||
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
|
||||
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
|
||||
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
|
||||
$Prompt = $pyvenvCfg['prompt'];
|
||||
}
|
||||
else {
|
||||
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
|
||||
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
|
||||
$Prompt = Split-Path -Path $venvDir -Leaf
|
||||
}
|
||||
}
|
||||
|
||||
Write-Verbose "Prompt = '$Prompt'"
|
||||
Write-Verbose "VenvDir='$VenvDir'"
|
||||
|
||||
# Deactivate any currently active virtual environment, but leave the
|
||||
# deactivate function in place.
|
||||
deactivate -nondestructive
|
||||
|
||||
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
|
||||
# that there is an activated venv.
|
||||
$env:VIRTUAL_ENV = $VenvDir
|
||||
|
||||
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
|
||||
|
||||
Write-Verbose "Setting prompt to '$Prompt'"
|
||||
|
||||
# Set the prompt to include the env name
|
||||
# Make sure _OLD_VIRTUAL_PROMPT is global
|
||||
function global:_OLD_VIRTUAL_PROMPT { "" }
|
||||
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
|
||||
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
|
||||
|
||||
function global:prompt {
|
||||
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
|
||||
_OLD_VIRTUAL_PROMPT
|
||||
}
|
||||
$env:VIRTUAL_ENV_PROMPT = $Prompt
|
||||
}
|
||||
|
||||
# Clear PYTHONHOME
|
||||
if (Test-Path -Path Env:PYTHONHOME) {
|
||||
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
|
||||
Remove-Item -Path Env:PYTHONHOME
|
||||
}
|
||||
|
||||
# Add the venv to the PATH
|
||||
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
|
||||
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
|
70
.venv/bin/activate
Normal file
70
.venv/bin/activate
Normal file
|
@ -0,0 +1,70 @@
|
|||
# This file must be used with "source bin/activate" *from bash*
|
||||
# You cannot run it directly
|
||||
|
||||
deactivate () {
|
||||
# reset old environment variables
|
||||
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
|
||||
PATH="${_OLD_VIRTUAL_PATH:-}"
|
||||
export PATH
|
||||
unset _OLD_VIRTUAL_PATH
|
||||
fi
|
||||
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
|
||||
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
|
||||
export PYTHONHOME
|
||||
unset _OLD_VIRTUAL_PYTHONHOME
|
||||
fi
|
||||
|
||||
# Call hash to forget past commands. Without forgetting
|
||||
# past commands the $PATH changes we made may not be respected
|
||||
hash -r 2> /dev/null
|
||||
|
||||
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
|
||||
PS1="${_OLD_VIRTUAL_PS1:-}"
|
||||
export PS1
|
||||
unset _OLD_VIRTUAL_PS1
|
||||
fi
|
||||
|
||||
unset VIRTUAL_ENV
|
||||
unset VIRTUAL_ENV_PROMPT
|
||||
if [ ! "${1:-}" = "nondestructive" ] ; then
|
||||
# Self destruct!
|
||||
unset -f deactivate
|
||||
fi
|
||||
}
|
||||
|
||||
# unset irrelevant variables
|
||||
deactivate nondestructive
|
||||
|
||||
# on Windows, a path can contain colons and backslashes and has to be converted:
|
||||
if [ "${OSTYPE:-}" = "cygwin" ] || [ "${OSTYPE:-}" = "msys" ] ; then
|
||||
# transform D:\path\to\venv to /d/path/to/venv on MSYS
|
||||
# and to /cygdrive/d/path/to/venv on Cygwin
|
||||
export VIRTUAL_ENV=$(cygpath /mnt/c/fieldcraft/witness-fracture/.venv)
|
||||
else
|
||||
# use the path as-is
|
||||
export VIRTUAL_ENV=/mnt/c/fieldcraft/witness-fracture/.venv
|
||||
fi
|
||||
|
||||
_OLD_VIRTUAL_PATH="$PATH"
|
||||
PATH="$VIRTUAL_ENV/"bin":$PATH"
|
||||
export PATH
|
||||
|
||||
# unset PYTHONHOME if set
|
||||
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
|
||||
# could use `if (set -u; : $PYTHONHOME) ;` in bash
|
||||
if [ -n "${PYTHONHOME:-}" ] ; then
|
||||
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
|
||||
unset PYTHONHOME
|
||||
fi
|
||||
|
||||
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
|
||||
_OLD_VIRTUAL_PS1="${PS1:-}"
|
||||
PS1='(.venv) '"${PS1:-}"
|
||||
export PS1
|
||||
VIRTUAL_ENV_PROMPT='(.venv) '
|
||||
export VIRTUAL_ENV_PROMPT
|
||||
fi
|
||||
|
||||
# Call hash to forget past commands. Without forgetting
|
||||
# past commands the $PATH changes we made may not be respected
|
||||
hash -r 2> /dev/null
|
27
.venv/bin/activate.csh
Normal file
27
.venv/bin/activate.csh
Normal file
|
@ -0,0 +1,27 @@
|
|||
# This file must be used with "source bin/activate.csh" *from csh*.
|
||||
# You cannot run it directly.
|
||||
|
||||
# Created by Davide Di Blasi <davidedb@gmail.com>.
|
||||
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
|
||||
|
||||
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
|
||||
|
||||
# Unset irrelevant variables.
|
||||
deactivate nondestructive
|
||||
|
||||
setenv VIRTUAL_ENV /mnt/c/fieldcraft/witness-fracture/.venv
|
||||
|
||||
set _OLD_VIRTUAL_PATH="$PATH"
|
||||
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
|
||||
|
||||
|
||||
set _OLD_VIRTUAL_PROMPT="$prompt"
|
||||
|
||||
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
|
||||
set prompt = '(.venv) '"$prompt"
|
||||
setenv VIRTUAL_ENV_PROMPT '(.venv) '
|
||||
endif
|
||||
|
||||
alias pydoc python -m pydoc
|
||||
|
||||
rehash
|
69
.venv/bin/activate.fish
Normal file
69
.venv/bin/activate.fish
Normal file
|
@ -0,0 +1,69 @@
|
|||
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
|
||||
# (https://fishshell.com/). You cannot run it directly.
|
||||
|
||||
function deactivate -d "Exit virtual environment and return to normal shell environment"
|
||||
# reset old environment variables
|
||||
if test -n "$_OLD_VIRTUAL_PATH"
|
||||
set -gx PATH $_OLD_VIRTUAL_PATH
|
||||
set -e _OLD_VIRTUAL_PATH
|
||||
end
|
||||
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
|
||||
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
|
||||
set -e _OLD_VIRTUAL_PYTHONHOME
|
||||
end
|
||||
|
||||
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
|
||||
set -e _OLD_FISH_PROMPT_OVERRIDE
|
||||
# prevents error when using nested fish instances (Issue #93858)
|
||||
if functions -q _old_fish_prompt
|
||||
functions -e fish_prompt
|
||||
functions -c _old_fish_prompt fish_prompt
|
||||
functions -e _old_fish_prompt
|
||||
end
|
||||
end
|
||||
|
||||
set -e VIRTUAL_ENV
|
||||
set -e VIRTUAL_ENV_PROMPT
|
||||
if test "$argv[1]" != "nondestructive"
|
||||
# Self-destruct!
|
||||
functions -e deactivate
|
||||
end
|
||||
end
|
||||
|
||||
# Unset irrelevant variables.
|
||||
deactivate nondestructive
|
||||
|
||||
set -gx VIRTUAL_ENV /mnt/c/fieldcraft/witness-fracture/.venv
|
||||
|
||||
set -gx _OLD_VIRTUAL_PATH $PATH
|
||||
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
|
||||
|
||||
# Unset PYTHONHOME if set.
|
||||
if set -q PYTHONHOME
|
||||
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
|
||||
set -e PYTHONHOME
|
||||
end
|
||||
|
||||
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
|
||||
# fish uses a function instead of an env var to generate the prompt.
|
||||
|
||||
# Save the current fish_prompt function as the function _old_fish_prompt.
|
||||
functions -c fish_prompt _old_fish_prompt
|
||||
|
||||
# With the original prompt function renamed, we can override with our own.
|
||||
function fish_prompt
|
||||
# Save the return status of the last command.
|
||||
set -l old_status $status
|
||||
|
||||
# Output the venv prompt; color taken from the blue of the Python logo.
|
||||
printf "%s%s%s" (set_color 4B8BBE) '(.venv) ' (set_color normal)
|
||||
|
||||
# Restore the return status of the previous command.
|
||||
echo "exit $old_status" | .
|
||||
# Output the original/"old" prompt.
|
||||
_old_fish_prompt
|
||||
end
|
||||
|
||||
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
|
||||
set -gx VIRTUAL_ENV_PROMPT '(.venv) '
|
||||
end
|
8
.venv/bin/ia
Normal file
8
.venv/bin/ia
Normal file
|
@ -0,0 +1,8 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from internetarchive.cli.ia import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
41
.venv/bin/jsondiff
Normal file
41
.venv/bin/jsondiff
Normal file
|
@ -0,0 +1,41 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import json
|
||||
import jsonpatch
|
||||
import argparse
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description='Diff two JSON files')
|
||||
parser.add_argument('FILE1', type=argparse.FileType('r'))
|
||||
parser.add_argument('FILE2', type=argparse.FileType('r'))
|
||||
parser.add_argument('--indent', type=int, default=None,
|
||||
help='Indent output by n spaces')
|
||||
parser.add_argument('-u', '--preserve-unicode', action='store_true',
|
||||
help='Output Unicode character as-is without using Code Point')
|
||||
parser.add_argument('-v', '--version', action='version',
|
||||
version='%(prog)s ' + jsonpatch.__version__)
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
diff_files()
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def diff_files():
|
||||
""" Diffs two JSON files and prints a patch """
|
||||
args = parser.parse_args()
|
||||
doc1 = json.load(args.FILE1)
|
||||
doc2 = json.load(args.FILE2)
|
||||
patch = jsonpatch.make_patch(doc1, doc2)
|
||||
if patch.patch:
|
||||
print(json.dumps(patch.patch, indent=args.indent, ensure_ascii=not(args.preserve_unicode)))
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
107
.venv/bin/jsonpatch
Normal file
107
.venv/bin/jsonpatch
Normal file
|
@ -0,0 +1,107 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
import os.path
|
||||
import json
|
||||
import jsonpatch
|
||||
import tempfile
|
||||
import argparse
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Apply a JSON patch on a JSON file')
|
||||
parser.add_argument('ORIGINAL', type=argparse.FileType('r'),
|
||||
help='Original file')
|
||||
parser.add_argument('PATCH', type=argparse.FileType('r'),
|
||||
nargs='?', default=sys.stdin,
|
||||
help='Patch file (read from stdin if omitted)')
|
||||
parser.add_argument('--indent', type=int, default=None,
|
||||
help='Indent output by n spaces')
|
||||
parser.add_argument('-b', '--backup', action='store_true',
|
||||
help='Back up ORIGINAL if modifying in-place')
|
||||
parser.add_argument('-i', '--in-place', action='store_true',
|
||||
help='Modify ORIGINAL in-place instead of to stdout')
|
||||
parser.add_argument('-v', '--version', action='version',
|
||||
version='%(prog)s ' + jsonpatch.__version__)
|
||||
parser.add_argument('-u', '--preserve-unicode', action='store_true',
|
||||
help='Output Unicode character as-is without using Code Point')
|
||||
|
||||
def main():
|
||||
try:
|
||||
patch_files()
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def patch_files():
|
||||
""" Diffs two JSON files and prints a patch """
|
||||
args = parser.parse_args()
|
||||
doc = json.load(args.ORIGINAL)
|
||||
patch = json.load(args.PATCH)
|
||||
result = jsonpatch.apply_patch(doc, patch)
|
||||
|
||||
if args.in_place:
|
||||
dirname = os.path.abspath(os.path.dirname(args.ORIGINAL.name))
|
||||
|
||||
try:
|
||||
# Attempt to replace the file atomically. We do this by
|
||||
# creating a temporary file in the same directory as the
|
||||
# original file so we can atomically move the new file over
|
||||
# the original later. (This is done in the same directory
|
||||
# because atomic renames do not work across mount points.)
|
||||
|
||||
fd, pathname = tempfile.mkstemp(dir=dirname)
|
||||
fp = os.fdopen(fd, 'w')
|
||||
atomic = True
|
||||
|
||||
except OSError:
|
||||
# We failed to create the temporary file for an atomic
|
||||
# replace, so fall back to non-atomic mode by backing up
|
||||
# the original (if desired) and writing a new file.
|
||||
|
||||
if args.backup:
|
||||
os.rename(args.ORIGINAL.name, args.ORIGINAL.name + '.orig')
|
||||
fp = open(args.ORIGINAL.name, 'w')
|
||||
atomic = False
|
||||
|
||||
else:
|
||||
# Since we're not replacing the original file in-place, write
|
||||
# the modified JSON to stdout instead.
|
||||
|
||||
fp = sys.stdout
|
||||
|
||||
# By this point we have some sort of file object we can write the
|
||||
# modified JSON to.
|
||||
|
||||
json.dump(result, fp, indent=args.indent, ensure_ascii=not(args.preserve_unicode))
|
||||
fp.write('\n')
|
||||
|
||||
if args.in_place:
|
||||
# Close the new file. If we aren't replacing atomically, this
|
||||
# is our last step, since everything else is already in place.
|
||||
|
||||
fp.close()
|
||||
|
||||
if atomic:
|
||||
try:
|
||||
# Complete the atomic replace by linking the original
|
||||
# to a backup (if desired), fixing up the permissions
|
||||
# on the temporary file, and moving it into place.
|
||||
|
||||
if args.backup:
|
||||
os.link(args.ORIGINAL.name, args.ORIGINAL.name + '.orig')
|
||||
os.chmod(pathname, os.stat(args.ORIGINAL.name).st_mode)
|
||||
os.rename(pathname, args.ORIGINAL.name)
|
||||
|
||||
except OSError:
|
||||
# In the event we could not actually do the atomic
|
||||
# replace, unlink the original to move it out of the
|
||||
# way and finally move the temporary file into place.
|
||||
|
||||
os.unlink(args.ORIGINAL.name)
|
||||
os.rename(pathname, args.ORIGINAL.name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
67
.venv/bin/jsonpointer
Normal file
67
.venv/bin/jsonpointer
Normal file
|
@ -0,0 +1,67 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
import jsonpointer
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Resolve a JSON pointer on JSON files')
|
||||
|
||||
# Accept pointer as argument or as file
|
||||
ptr_group = parser.add_mutually_exclusive_group(required=True)
|
||||
|
||||
ptr_group.add_argument('-f', '--pointer-file', type=argparse.FileType('r'),
|
||||
nargs='?',
|
||||
help='File containing a JSON pointer expression')
|
||||
|
||||
ptr_group.add_argument('POINTER', type=str, nargs='?',
|
||||
help='A JSON pointer expression')
|
||||
|
||||
parser.add_argument('FILE', type=argparse.FileType('r'), nargs='+',
|
||||
help='Files for which the pointer should be resolved')
|
||||
parser.add_argument('--indent', type=int, default=None,
|
||||
help='Indent output by n spaces')
|
||||
parser.add_argument('-v', '--version', action='version',
|
||||
version='%(prog)s ' + jsonpointer.__version__)
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
resolve_files()
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def parse_pointer(args):
|
||||
if args.POINTER:
|
||||
ptr = args.POINTER
|
||||
elif args.pointer_file:
|
||||
ptr = args.pointer_file.read().strip()
|
||||
else:
|
||||
parser.print_usage()
|
||||
sys.exit(1)
|
||||
|
||||
return ptr
|
||||
|
||||
|
||||
def resolve_files():
|
||||
""" Resolve a JSON pointer on JSON files """
|
||||
args = parser.parse_args()
|
||||
|
||||
ptr = parse_pointer(args)
|
||||
|
||||
for f in args.FILE:
|
||||
doc = json.load(f)
|
||||
try:
|
||||
result = jsonpointer.resolve_pointer(doc, ptr)
|
||||
print(json.dumps(result, indent=args.indent))
|
||||
except jsonpointer.JsonPointerException as e:
|
||||
print('Could not resolve pointer: %s' % str(e), file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
8
.venv/bin/normalizer
Normal file
8
.venv/bin/normalizer
Normal file
|
@ -0,0 +1,8 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from charset_normalizer import cli
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli.cli_detect())
|
8
.venv/bin/pip
Normal file
8
.venv/bin/pip
Normal file
|
@ -0,0 +1,8 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
8
.venv/bin/pip3
Normal file
8
.venv/bin/pip3
Normal file
|
@ -0,0 +1,8 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
8
.venv/bin/pip3.12
Normal file
8
.venv/bin/pip3.12
Normal file
|
@ -0,0 +1,8 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
1
.venv/bin/python
Symbolic link
1
.venv/bin/python
Symbolic link
|
@ -0,0 +1 @@
|
|||
python3
|
1
.venv/bin/python3
Symbolic link
1
.venv/bin/python3
Symbolic link
|
@ -0,0 +1 @@
|
|||
/usr/bin/python3
|
1
.venv/bin/python3.12
Symbolic link
1
.venv/bin/python3.12
Symbolic link
|
@ -0,0 +1 @@
|
|||
python3
|
8
.venv/bin/tqdm
Normal file
8
.venv/bin/tqdm
Normal file
|
@ -0,0 +1,8 @@
|
|||
#!/mnt/c/fieldcraft/witness-fracture/.venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from tqdm.cli import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1 @@
|
|||
pip
|
|
@ -0,0 +1,77 @@
|
|||
Metadata-Version: 2.4
|
||||
Name: certifi
|
||||
Version: 2025.6.15
|
||||
Summary: Python package for providing Mozilla's CA Bundle.
|
||||
Home-page: https://github.com/certifi/python-certifi
|
||||
Author: Kenneth Reitz
|
||||
Author-email: me@kennethreitz.com
|
||||
License: MPL-2.0
|
||||
Project-URL: Source, https://github.com/certifi/python-certifi
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Requires-Python: >=3.7
|
||||
License-File: LICENSE
|
||||
Dynamic: author
|
||||
Dynamic: author-email
|
||||
Dynamic: classifier
|
||||
Dynamic: description
|
||||
Dynamic: home-page
|
||||
Dynamic: license
|
||||
Dynamic: license-file
|
||||
Dynamic: project-url
|
||||
Dynamic: requires-python
|
||||
Dynamic: summary
|
||||
|
||||
Certifi: Python SSL Certificates
|
||||
================================
|
||||
|
||||
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
||||
validating the trustworthiness of SSL certificates while verifying the identity
|
||||
of TLS hosts. It has been extracted from the `Requests`_ project.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
||||
|
||||
$ pip install certifi
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
To reference the installed certificate authority (CA) bundle, you can use the
|
||||
built-in function::
|
||||
|
||||
>>> import certifi
|
||||
|
||||
>>> certifi.where()
|
||||
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
||||
|
||||
Or from the command line::
|
||||
|
||||
$ python -m certifi
|
||||
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
||||
|
||||
Enjoy!
|
||||
|
||||
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
||||
|
||||
Addition/Removal of Certificates
|
||||
--------------------------------
|
||||
|
||||
Certifi does not support any addition/removal or other modification of the
|
||||
CA trust store content. This project is intended to provide a reliable and
|
||||
highly portable root of trust to python deployments. Look to upstream projects
|
||||
for methods to use alternate trust.
|
|
@ -0,0 +1,14 @@
|
|||
certifi-2025.6.15.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
certifi-2025.6.15.dist-info/METADATA,sha256=zgiNbFi9bMv8olVT9OQxB7G5-xQmCjQZ5eO7Z0IGKoI,2423
|
||||
certifi-2025.6.15.dist-info/RECORD,,
|
||||
certifi-2025.6.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
||||
certifi-2025.6.15.dist-info/licenses/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
|
||||
certifi-2025.6.15.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
||||
certifi/__init__.py,sha256=H8ta5ryBBsJrzYoTklAyzrzsu-dZlfkyL5_aPq5vvFM,94
|
||||
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
||||
certifi/__pycache__/__init__.cpython-312.pyc,,
|
||||
certifi/__pycache__/__main__.cpython-312.pyc,,
|
||||
certifi/__pycache__/core.cpython-312.pyc,,
|
||||
certifi/cacert.pem,sha256=sc3S1mV1jvSdCPQOoT4agm5fBBLp4JQMkh7RAhRkzcI,281225
|
||||
certifi/core.py,sha256=XFXycndG5pf37ayeF8N32HUuDafsyhkVMbO4BAPWHa0,3394
|
||||
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@ -0,0 +1,5 @@
|
|||
Wheel-Version: 1.0
|
||||
Generator: setuptools (80.9.0)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
This package contains a modified version of ca-bundle.crt:
|
||||
|
||||
ca-bundle.crt -- Bundle of CA Root Certificates
|
||||
|
||||
This is a bundle of X.509 certificates of public Certificate Authorities
|
||||
(CA). These were automatically extracted from Mozilla's root certificates
|
||||
file (certdata.txt). This file can be found in the mozilla source tree:
|
||||
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
||||
It contains the certificates in PEM format and therefore
|
||||
can be directly used with curl / libcurl / php_curl, or with
|
||||
an Apache+mod_ssl webserver for SSL client authentication.
|
||||
Just configure this file as the SSLCACertificateFile.#
|
||||
|
||||
***** BEGIN LICENSE BLOCK *****
|
||||
This Source Code Form is subject to the terms of the Mozilla Public License,
|
||||
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
||||
one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
***** END LICENSE BLOCK *****
|
||||
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
|
@ -0,0 +1 @@
|
|||
certifi
|
4
.venv/lib/python3.12/site-packages/certifi/__init__.py
Normal file
4
.venv/lib/python3.12/site-packages/certifi/__init__.py
Normal file
|
@ -0,0 +1,4 @@
|
|||
from .core import contents, where
|
||||
|
||||
__all__ = ["contents", "where"]
|
||||
__version__ = "2025.06.15"
|
12
.venv/lib/python3.12/site-packages/certifi/__main__.py
Normal file
12
.venv/lib/python3.12/site-packages/certifi/__main__.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
import argparse
|
||||
|
||||
from certifi import contents, where
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-c", "--contents", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.contents:
|
||||
print(contents())
|
||||
else:
|
||||
print(where())
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
4635
.venv/lib/python3.12/site-packages/certifi/cacert.pem
Normal file
4635
.venv/lib/python3.12/site-packages/certifi/cacert.pem
Normal file
File diff suppressed because it is too large
Load diff
83
.venv/lib/python3.12/site-packages/certifi/core.py
Normal file
83
.venv/lib/python3.12/site-packages/certifi/core.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
"""
|
||||
certifi.py
|
||||
~~~~~~~~~~
|
||||
|
||||
This module returns the installation location of cacert.pem or its contents.
|
||||
"""
|
||||
import sys
|
||||
import atexit
|
||||
|
||||
def exit_cacert_ctx() -> None:
|
||||
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
|
||||
|
||||
|
||||
if sys.version_info >= (3, 11):
|
||||
|
||||
from importlib.resources import as_file, files
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the file
|
||||
# in cases where we're inside of a zipimport situation until someone
|
||||
# actually calls where(), but we don't want to re-extract the file
|
||||
# on every call of where(), so we'll do it once then store it in a
|
||||
# global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you to
|
||||
# manage the cleanup of this file, so it doesn't actually return a
|
||||
# path, it returns a context manager that will give you the path
|
||||
# when you enter it and will do any cleanup when you leave it. In
|
||||
# the common case of not needing a temporary file, it will just
|
||||
# return the file system location and the __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
atexit.register(exit_cacert_ctx)
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
||||
|
||||
else:
|
||||
|
||||
from importlib.resources import path as get_path, read_text
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the
|
||||
# file in cases where we're inside of a zipimport situation until
|
||||
# someone actually calls where(), but we don't want to re-extract
|
||||
# the file on every call of where(), so we'll do it once then store
|
||||
# it in a global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you
|
||||
# to manage the cleanup of this file, so it doesn't actually
|
||||
# return a path, it returns a context manager that will give
|
||||
# you the path when you enter it and will do any cleanup when
|
||||
# you leave it. In the common case of not needing a temporary
|
||||
# file, it will just return the file system location and the
|
||||
# __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = get_path("certifi", "cacert.pem")
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
atexit.register(exit_cacert_ctx)
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return read_text("certifi", "cacert.pem", encoding="ascii")
|
0
.venv/lib/python3.12/site-packages/certifi/py.typed
Normal file
0
.venv/lib/python3.12/site-packages/certifi/py.typed
Normal file
|
@ -0,0 +1 @@
|
|||
pip
|
|
@ -0,0 +1,731 @@
|
|||
Metadata-Version: 2.4
|
||||
Name: charset-normalizer
|
||||
Version: 3.4.2
|
||||
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
||||
Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
||||
Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
||||
License: MIT
|
||||
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
|
||||
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
|
||||
Project-URL: Code, https://github.com/jawah/charset_normalizer
|
||||
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
|
||||
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Text Processing :: Linguistic
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Typing :: Typed
|
||||
Requires-Python: >=3.7
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Provides-Extra: unicode-backport
|
||||
Dynamic: license-file
|
||||
|
||||
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
||||
|
||||
<p align="center">
|
||||
<sup>The Real First Universal Charset Detector</sup><br>
|
||||
<a href="https://pypi.org/project/charset-normalizer">
|
||||
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
||||
</a>
|
||||
<a href="https://pepy.tech/project/charset-normalizer/">
|
||||
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
||||
</a>
|
||||
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
||||
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<sup><i>Featured Packages</i></sup><br>
|
||||
<a href="https://github.com/jawah/niquests">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
|
||||
</a>
|
||||
<a href="https://github.com/jawah/wassima">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
|
||||
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
||||
> I'm trying to resolve the issue by taking a new approach.
|
||||
> All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
<p align="center">
|
||||
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
||||
</p>
|
||||
|
||||
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
||||
|
||||
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
||||
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
||||
| `Fast` | ❌ | ✅ | ✅ |
|
||||
| `Universal**` | ❌ | ✅ | ❌ |
|
||||
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
||||
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
||||
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
||||
| `Native Python` | ✅ | ✅ | ❌ |
|
||||
| `Detect spoken language` | ❌ | ✅ | N/A |
|
||||
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
||||
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
||||
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
||||
|
||||
<p align="center">
|
||||
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
||||
</p>
|
||||
|
||||
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
||||
|
||||
## ⚡ Performance
|
||||
|
||||
This package offer better performance than its counterpart Chardet. Here are some numbers.
|
||||
|
||||
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
||||
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
||||
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
|
||||
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
||||
|
||||
| Package | 99th percentile | 95th percentile | 50th percentile |
|
||||
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
||||
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
|
||||
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
||||
|
||||
_updated as of december 2024 using CPython 3.12_
|
||||
|
||||
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
|
||||
|
||||
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
||||
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
||||
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
|
||||
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
||||
> (e.g. Supported Encoding) Challenge-them if you want.
|
||||
|
||||
## ✨ Installation
|
||||
|
||||
Using pip:
|
||||
|
||||
```sh
|
||||
pip install charset-normalizer -U
|
||||
```
|
||||
|
||||
## 🚀 Basic Usage
|
||||
|
||||
### CLI
|
||||
This package comes with a CLI.
|
||||
|
||||
```
|
||||
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
||||
file [file ...]
|
||||
|
||||
The Real First Universal Charset Detector. Discover originating encoding used
|
||||
on text file. Normalize text to unicode.
|
||||
|
||||
positional arguments:
|
||||
files File(s) to be analysed
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --verbose Display complementary information about file if any.
|
||||
Stdout will contain logs about the detection process.
|
||||
-a, --with-alternative
|
||||
Output complementary possibilities if any. Top-level
|
||||
JSON WILL be a list.
|
||||
-n, --normalize Permit to normalize input file. If not set, program
|
||||
does not write anything.
|
||||
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
||||
JSON output.
|
||||
-r, --replace Replace file when trying to normalize it instead of
|
||||
creating a new one.
|
||||
-f, --force Replace file without asking if you are sure, use this
|
||||
flag with caution.
|
||||
-t THRESHOLD, --threshold THRESHOLD
|
||||
Define a custom maximum amount of chaos allowed in
|
||||
decoded content. 0. <= chaos <= 1.
|
||||
--version Show version information and exit.
|
||||
```
|
||||
|
||||
```bash
|
||||
normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
python -m charset_normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
|
||||
|
||||
```json
|
||||
{
|
||||
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
||||
"encoding": "cp1252",
|
||||
"encoding_aliases": [
|
||||
"1252",
|
||||
"windows_1252"
|
||||
],
|
||||
"alternative_encodings": [
|
||||
"cp1254",
|
||||
"cp1256",
|
||||
"cp1258",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
"mbcs"
|
||||
],
|
||||
"language": "French",
|
||||
"alphabets": [
|
||||
"Basic Latin",
|
||||
"Latin-1 Supplement"
|
||||
],
|
||||
"has_sig_or_bom": false,
|
||||
"chaos": 0.149,
|
||||
"coherence": 97.152,
|
||||
"unicode_path": null,
|
||||
"is_preferred": true
|
||||
}
|
||||
```
|
||||
|
||||
### Python
|
||||
*Just print out normalized text*
|
||||
```python
|
||||
from charset_normalizer import from_path
|
||||
|
||||
results = from_path('./my_subtitle.srt')
|
||||
|
||||
print(str(results.best()))
|
||||
```
|
||||
|
||||
*Upgrade your code without effort*
|
||||
```python
|
||||
from charset_normalizer import detect
|
||||
```
|
||||
|
||||
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
||||
|
||||
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
||||
|
||||
## 😇 Why
|
||||
|
||||
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
||||
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
||||
|
||||
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
||||
produce **two identical rendered string.**
|
||||
What I want is to get readable text, the best I can.
|
||||
|
||||
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
||||
|
||||
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair Unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
|
||||
|
||||
## 🍰 How
|
||||
|
||||
- Discard all charset encoding table that could not fit the binary content.
|
||||
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
||||
- Extract matches with the lowest mess detected.
|
||||
- Additionally, we measure coherence / probe for a language.
|
||||
|
||||
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
||||
|
||||
*Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
||||
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
|
||||
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
||||
improve or rewrite it.
|
||||
|
||||
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
||||
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
||||
|
||||
## ⚡ Known limitations
|
||||
|
||||
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
||||
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
|
||||
|
||||
## ⚠️ About Python EOLs
|
||||
|
||||
**If you are running:**
|
||||
|
||||
- Python >=2.7,<3.5: Unsupported
|
||||
- Python 3.5: charset-normalizer < 2.1
|
||||
- Python 3.6: charset-normalizer < 3.1
|
||||
- Python 3.7: charset-normalizer < 4.0
|
||||
|
||||
Upgrade your Python interpreter as soon as possible.
|
||||
|
||||
## 👤 Contributing
|
||||
|
||||
Contributions, issues and feature requests are very much welcome.<br />
|
||||
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
||||
|
||||
## 📝 License
|
||||
|
||||
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
||||
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
||||
|
||||
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
||||
|
||||
## 💼 For Enterprise
|
||||
|
||||
Professional support for charset-normalizer is available as part of the [Tidelift
|
||||
Subscription][1]. Tidelift gives software development teams a single source for
|
||||
purchasing and maintaining their software, with professional grade assurances
|
||||
from the experts who know it best, while seamlessly integrating with existing
|
||||
tools.
|
||||
|
||||
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
||||
|
||||
[](https://www.bestpractices.dev/projects/7297)
|
||||
|
||||
# Changelog
|
||||
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||
|
||||
## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
|
||||
|
||||
### Fixed
|
||||
- Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
|
||||
- Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
|
||||
|
||||
### Changed
|
||||
- Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
|
||||
|
||||
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
|
||||
|
||||
### Changed
|
||||
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
|
||||
- Enforce annotation delayed loading for a simpler and consistent types in the project.
|
||||
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
|
||||
|
||||
### Added
|
||||
- pre-commit configuration.
|
||||
- noxfile.
|
||||
|
||||
### Removed
|
||||
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
|
||||
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
|
||||
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
|
||||
- Unused `utils.range_scan` function.
|
||||
|
||||
### Fixed
|
||||
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
|
||||
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
|
||||
|
||||
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
|
||||
|
||||
### Added
|
||||
- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
|
||||
- Support for Python 3.13 (#512)
|
||||
|
||||
### Fixed
|
||||
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
|
||||
- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
|
||||
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
|
||||
|
||||
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
||||
|
||||
### Fixed
|
||||
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
||||
- Regression on some detection case showcased in the documentation (#371)
|
||||
|
||||
### Added
|
||||
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
|
||||
|
||||
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
||||
|
||||
### Changed
|
||||
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
||||
- Improved the general detection reliability based on reports from the community
|
||||
|
||||
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
||||
|
||||
### Added
|
||||
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
||||
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
||||
|
||||
### Removed
|
||||
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
||||
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
||||
|
||||
### Changed
|
||||
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
||||
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
||||
|
||||
### Fixed
|
||||
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
||||
|
||||
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
||||
|
||||
### Changed
|
||||
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
|
||||
- Minor improvement over the global detection reliability
|
||||
|
||||
### Added
|
||||
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
||||
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
||||
- Explicit support for Python 3.12
|
||||
|
||||
### Fixed
|
||||
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
||||
|
||||
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
||||
|
||||
### Added
|
||||
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
||||
|
||||
### Removed
|
||||
- Support for Python 3.6 (PR #260)
|
||||
|
||||
### Changed
|
||||
- Optional speedup provided by mypy/c 1.0.1
|
||||
|
||||
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
||||
|
||||
### Fixed
|
||||
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
||||
|
||||
### Changed
|
||||
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
||||
|
||||
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
||||
|
||||
### Added
|
||||
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
||||
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
||||
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
||||
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
||||
|
||||
### Changed
|
||||
- Build with static metadata using 'build' frontend
|
||||
- Make the language detection stricter
|
||||
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
||||
|
||||
### Fixed
|
||||
- CLI with opt --normalize fail when using full path for files
|
||||
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
||||
- Sphinx warnings when generating the documentation
|
||||
|
||||
### Removed
|
||||
- Coherence detector no longer return 'Simple English' instead return 'English'
|
||||
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
||||
- Breaking: Method `first()` and `best()` from CharsetMatch
|
||||
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
||||
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
||||
- Breaking: Top-level function `normalize`
|
||||
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
||||
- Support for the backport `unicodedata2`
|
||||
|
||||
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
||||
|
||||
### Added
|
||||
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
||||
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
||||
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
||||
|
||||
### Changed
|
||||
- Build with static metadata using 'build' frontend
|
||||
- Make the language detection stricter
|
||||
|
||||
### Fixed
|
||||
- CLI with opt --normalize fail when using full path for files
|
||||
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
||||
|
||||
### Removed
|
||||
- Coherence detector no longer return 'Simple English' instead return 'English'
|
||||
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
||||
|
||||
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
||||
|
||||
### Added
|
||||
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
||||
|
||||
### Removed
|
||||
- Breaking: Method `first()` and `best()` from CharsetMatch
|
||||
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
||||
|
||||
### Fixed
|
||||
- Sphinx warnings when generating the documentation
|
||||
|
||||
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
||||
|
||||
### Changed
|
||||
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
||||
|
||||
### Removed
|
||||
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
||||
- Breaking: Top-level function `normalize`
|
||||
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
||||
- Support for the backport `unicodedata2`
|
||||
|
||||
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
||||
|
||||
### Deprecated
|
||||
- Function `normalize` scheduled for removal in 3.0
|
||||
|
||||
### Changed
|
||||
- Removed useless call to decode in fn is_unprintable (#206)
|
||||
|
||||
### Fixed
|
||||
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
||||
|
||||
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
||||
|
||||
### Added
|
||||
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
||||
|
||||
### Changed
|
||||
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
||||
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
||||
|
||||
### Fixed
|
||||
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
||||
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
||||
|
||||
### Removed
|
||||
- Support for Python 3.5 (PR #192)
|
||||
|
||||
### Deprecated
|
||||
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
||||
|
||||
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
||||
|
||||
### Fixed
|
||||
- ASCII miss-detection on rare cases (PR #170)
|
||||
|
||||
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
||||
|
||||
### Added
|
||||
- Explicit support for Python 3.11 (PR #164)
|
||||
|
||||
### Changed
|
||||
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
||||
|
||||
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
||||
|
||||
### Fixed
|
||||
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
||||
|
||||
### Changed
|
||||
- Skipping the language-detection (CD) on ASCII (PR #155)
|
||||
|
||||
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
||||
|
||||
### Changed
|
||||
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
||||
|
||||
### Fixed
|
||||
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
||||
|
||||
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
||||
### Changed
|
||||
- Improvement over Vietnamese detection (PR #126)
|
||||
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
||||
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
||||
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
||||
- Code style as refactored by Sourcery-AI (PR #131)
|
||||
- Minor adjustment on the MD around european words (PR #133)
|
||||
- Remove and replace SRTs from assets / tests (PR #139)
|
||||
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
||||
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
||||
|
||||
### Fixed
|
||||
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
||||
- Avoid using too insignificant chunk (PR #137)
|
||||
|
||||
### Added
|
||||
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
||||
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
||||
|
||||
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
||||
### Added
|
||||
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
||||
|
||||
### Changed
|
||||
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
||||
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
||||
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
||||
- Various detection improvement (MD+CD) (PR #117)
|
||||
|
||||
### Removed
|
||||
- Remove redundant logging entry about detected language(s) (PR #115)
|
||||
|
||||
### Fixed
|
||||
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
||||
|
||||
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
||||
### Fixed
|
||||
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
||||
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
||||
|
||||
### Changed
|
||||
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
||||
|
||||
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
||||
### Changed
|
||||
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
||||
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
||||
- The Unicode detection is slightly improved (PR #93)
|
||||
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
||||
|
||||
### Removed
|
||||
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
|
||||
|
||||
### Fixed
|
||||
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
||||
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
||||
- The MANIFEST.in was not exhaustive (PR #78)
|
||||
|
||||
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
||||
### Fixed
|
||||
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
|
||||
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
||||
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
||||
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
||||
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
||||
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
||||
|
||||
### Changed
|
||||
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
||||
- Allow fallback on specified encoding if any (PR #71)
|
||||
|
||||
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
||||
### Changed
|
||||
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
||||
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
||||
|
||||
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
||||
### Fixed
|
||||
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
||||
|
||||
### Changed
|
||||
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
||||
|
||||
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
||||
### Fixed
|
||||
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
||||
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
|
||||
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
|
||||
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
||||
|
||||
### Changed
|
||||
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
||||
|
||||
### Added
|
||||
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
||||
|
||||
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
||||
### Changed
|
||||
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
||||
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
|
||||
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
||||
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
||||
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
|
||||
- utf_7 detection has been reinstated.
|
||||
|
||||
### Removed
|
||||
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
|
||||
- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
||||
- The exception hook on UnicodeDecodeError has been removed.
|
||||
|
||||
### Deprecated
|
||||
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
||||
|
||||
### Fixed
|
||||
- The CLI output used the relative path of the file(s). Should be absolute.
|
||||
|
||||
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
||||
### Fixed
|
||||
- Logger configuration/usage no longer conflict with others (PR #44)
|
||||
|
||||
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
||||
### Removed
|
||||
- Using standard logging instead of using the package loguru.
|
||||
- Dropping nose test framework in favor of the maintained pytest.
|
||||
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
||||
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
||||
- Stop support for UTF-7 that does not contain a SIG.
|
||||
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
||||
|
||||
### Fixed
|
||||
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
||||
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
||||
|
||||
### Changed
|
||||
- Improving the package final size by compressing frequencies.json.
|
||||
- Huge improvement over the larges payload.
|
||||
|
||||
### Added
|
||||
- CLI now produces JSON consumable output.
|
||||
- Return ASCII if given sequences fit. Given reasonable confidence.
|
||||
|
||||
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
||||
|
||||
### Fixed
|
||||
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
||||
|
||||
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
||||
|
||||
### Fixed
|
||||
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
||||
|
||||
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
||||
|
||||
### Fixed
|
||||
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
||||
|
||||
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
||||
|
||||
### Changed
|
||||
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
||||
|
||||
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
||||
|
||||
### Fixed
|
||||
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
||||
|
||||
### Changed
|
||||
- Dependencies refactoring, constraints revised.
|
||||
|
||||
### Added
|
||||
- Add python 3.9 and 3.10 to the supported interpreters
|
||||
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 TAHRI Ahmed R.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,35 @@
|
|||
../../../bin/normalizer,sha256=ps2PJvH01MlyWVzHF_Ow1qWMxFsTBsJ72zHxQvwkP4A,262
|
||||
charset_normalizer-3.4.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
charset_normalizer-3.4.2.dist-info/METADATA,sha256=zNVWE4rQW-YZIMHKSayMypu37bLj_xayLorHl0-HAHQ,35743
|
||||
charset_normalizer-3.4.2.dist-info/RECORD,,
|
||||
charset_normalizer-3.4.2.dist-info/WHEEL,sha256=escAp0nQBKBkW_DNq5usPAJtWAtnzdSwKaeRcFbdm08,151
|
||||
charset_normalizer-3.4.2.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
|
||||
charset_normalizer-3.4.2.dist-info/licenses/LICENSE,sha256=bQ1Bv-FwrGx9wkjJpj4lTQ-0WmDVCoJX0K-SxuJJuIc,1071
|
||||
charset_normalizer-3.4.2.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
||||
charset_normalizer/__init__.py,sha256=OKRxRv2Zhnqk00tqkN0c1BtJjm165fWXLydE52IKuHc,1590
|
||||
charset_normalizer/__main__.py,sha256=yzYxMR-IhKRHYwcSlavEv8oGdwxsR89mr2X09qXGdps,109
|
||||
charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/api.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/cd.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/constant.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/md.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/models.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/utils.cpython-312.pyc,,
|
||||
charset_normalizer/__pycache__/version.cpython-312.pyc,,
|
||||
charset_normalizer/api.py,sha256=qBRz8mJ_R5E713R6TOyqHEdnmyxbEDnCSHvx32ubDGg,22617
|
||||
charset_normalizer/cd.py,sha256=WKTo1HDb-H9HfCDc3Bfwq5jzS25Ziy9SE2a74SgTq88,12522
|
||||
charset_normalizer/cli/__init__.py,sha256=D8I86lFk2-py45JvqxniTirSj_sFyE6sjaY_0-G1shc,136
|
||||
charset_normalizer/cli/__main__.py,sha256=dMaXG6IJXRvqq8z2tig7Qb83-BpWTln55ooiku5_uvg,12646
|
||||
charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
|
||||
charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
|
||||
charset_normalizer/constant.py,sha256=7UVY4ldYhmQMHUdgQ_sgZmzcQ0xxYxpBunqSZ-XJZ8U,42713
|
||||
charset_normalizer/legacy.py,sha256=SZE_AJujOYB1y9Y3-FkFOW9Ye4H1EHTzPbZR8DEsgl8,2287
|
||||
charset_normalizer/md.cpython-312-x86_64-linux-gnu.so,sha256=u_tcYtClfGbflvoDRpoO4ph7Uao7zmkdgF2Q0Rf1GNU,16120
|
||||
charset_normalizer/md.py,sha256=-_oN3h3_X99nkFfqamD3yu45DC_wfk5odH0Tr_CQiXs,20145
|
||||
charset_normalizer/md__mypyc.cpython-312-x86_64-linux-gnu.so,sha256=Vd-fx1sAk98CzIjJ2N72eqUZlX1W2rO6r3BG2-xJT0I,280856
|
||||
charset_normalizer/models.py,sha256=lKXhOnIPtiakbK3i__J9wpOfzx3JDTKj7Dn3Rg0VaRI,12394
|
||||
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/utils.py,sha256=sTejPgrdlNsKNucZfJCxJ95lMTLA0ShHLLE3n5wpT9Q,12170
|
||||
charset_normalizer/version.py,sha256=koLXlDwKAU5IlYP4qkOpFnsncn74hlphHAlBhlDKzyw,115
|
|
@ -0,0 +1,6 @@
|
|||
Wheel-Version: 1.0
|
||||
Generator: setuptools (80.1.0)
|
||||
Root-Is-Purelib: false
|
||||
Tag: cp312-cp312-manylinux_2_17_x86_64
|
||||
Tag: cp312-cp312-manylinux2014_x86_64
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
[console_scripts]
|
||||
normalizer = charset_normalizer:cli.cli_detect
|
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2025 TAHRI Ahmed R.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1 @@
|
|||
charset_normalizer
|
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
Charset-Normalizer
|
||||
~~~~~~~~~~~~~~
|
||||
The Real First Universal Charset Detector.
|
||||
A library that helps you read text from an unknown charset encoding.
|
||||
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
||||
All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
Basic usage:
|
||||
>>> from charset_normalizer import from_bytes
|
||||
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
||||
>>> best_guess = results.best()
|
||||
>>> str(best_guess)
|
||||
'Bсеки човек има право на образование. Oбразованието!'
|
||||
|
||||
Others methods and usages are available - see the full documentation
|
||||
at <https://github.com/Ousret/charset_normalizer>.
|
||||
:copyright: (c) 2021 by Ahmed TAHRI
|
||||
:license: MIT, see LICENSE for more details.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, is_binary
|
||||
from .legacy import detect
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import set_logging_handler
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = (
|
||||
"from_fp",
|
||||
"from_path",
|
||||
"from_bytes",
|
||||
"is_binary",
|
||||
"detect",
|
||||
"CharsetMatch",
|
||||
"CharsetMatches",
|
||||
"__version__",
|
||||
"VERSION",
|
||||
"set_logging_handler",
|
||||
)
|
||||
|
||||
# Attach a NullHandler to the top level logger by default
|
||||
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
||||
|
||||
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
|
@ -0,0 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from .cli import cli_detect
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
668
.venv/lib/python3.12/site-packages/charset_normalizer/api.py
Normal file
668
.venv/lib/python3.12/site-packages/charset_normalizer/api.py
Normal file
|
@ -0,0 +1,668 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from os import PathLike
|
||||
from typing import BinaryIO
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
encoding_languages,
|
||||
mb_encoding_languages,
|
||||
merge_coherence_ratios,
|
||||
)
|
||||
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
||||
from .md import mess_ratio
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_cp_similar,
|
||||
is_multi_byte_encoding,
|
||||
should_strip_sig_or_bom,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("charset_normalizer")
|
||||
explain_handler = logging.StreamHandler()
|
||||
explain_handler.setFormatter(
|
||||
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
||||
)
|
||||
|
||||
|
||||
def from_bytes(
|
||||
sequences: bytes | bytearray,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.2,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
|
||||
If there is no results, it is a strong indicator that the source is binary/not text.
|
||||
By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
|
||||
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
||||
|
||||
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
|
||||
but never take it for granted. Can improve the performance.
|
||||
|
||||
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
|
||||
purpose.
|
||||
|
||||
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
||||
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
|
||||
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
||||
Custom logging format and handler can be set manually.
|
||||
"""
|
||||
|
||||
if not isinstance(sequences, (bytearray, bytes)):
|
||||
raise TypeError(
|
||||
"Expected object of type bytes or bytearray, got: {}".format(
|
||||
type(sequences)
|
||||
)
|
||||
)
|
||||
|
||||
if explain:
|
||||
previous_logger_level: int = logger.level
|
||||
logger.addHandler(explain_handler)
|
||||
logger.setLevel(TRACE)
|
||||
|
||||
length: int = len(sequences)
|
||||
|
||||
if length == 0:
|
||||
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level or logging.WARNING)
|
||||
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
||||
|
||||
if cp_isolation is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_isolation is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding allowed : %s.",
|
||||
", ".join(cp_isolation),
|
||||
)
|
||||
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
||||
else:
|
||||
cp_isolation = []
|
||||
|
||||
if cp_exclusion is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_exclusion is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding excluded : %s.",
|
||||
", ".join(cp_exclusion),
|
||||
)
|
||||
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
||||
else:
|
||||
cp_exclusion = []
|
||||
|
||||
if length <= (chunk_size * steps):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
||||
steps,
|
||||
chunk_size,
|
||||
length,
|
||||
)
|
||||
steps = 1
|
||||
chunk_size = length
|
||||
|
||||
if steps > 1 and length / steps < chunk_size:
|
||||
chunk_size = int(length / steps)
|
||||
|
||||
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
||||
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
||||
|
||||
if is_too_small_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
elif is_too_large_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
|
||||
prioritized_encodings: list[str] = []
|
||||
|
||||
specified_encoding: str | None = (
|
||||
any_specified_encoding(sequences) if preemptive_behaviour else None
|
||||
)
|
||||
|
||||
if specified_encoding is not None:
|
||||
prioritized_encodings.append(specified_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
||||
specified_encoding,
|
||||
)
|
||||
|
||||
tested: set[str] = set()
|
||||
tested_but_hard_failure: list[str] = []
|
||||
tested_but_soft_failure: list[str] = []
|
||||
|
||||
fallback_ascii: CharsetMatch | None = None
|
||||
fallback_u8: CharsetMatch | None = None
|
||||
fallback_specified: CharsetMatch | None = None
|
||||
|
||||
results: CharsetMatches = CharsetMatches()
|
||||
|
||||
early_stop_results: CharsetMatches = CharsetMatches()
|
||||
|
||||
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
||||
|
||||
if sig_encoding is not None:
|
||||
prioritized_encodings.append(sig_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
||||
len(sig_payload),
|
||||
sig_encoding,
|
||||
)
|
||||
|
||||
prioritized_encodings.append("ascii")
|
||||
|
||||
if "utf_8" not in prioritized_encodings:
|
||||
prioritized_encodings.append("utf_8")
|
||||
|
||||
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
|
||||
if cp_isolation and encoding_iana not in cp_isolation:
|
||||
continue
|
||||
|
||||
if cp_exclusion and encoding_iana in cp_exclusion:
|
||||
continue
|
||||
|
||||
if encoding_iana in tested:
|
||||
continue
|
||||
|
||||
tested.add(encoding_iana)
|
||||
|
||||
decoded_payload: str | None = None
|
||||
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
encoding_iana
|
||||
)
|
||||
|
||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s does not provide an IncrementalDecoder",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
if is_too_large_sequence and is_multi_byte_decoder is False:
|
||||
str(
|
||||
(
|
||||
sequences[: int(50e4)]
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) : int(50e4)]
|
||||
),
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
else:
|
||||
decoded_payload = str(
|
||||
(
|
||||
sequences
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) :]
|
||||
),
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
if not isinstance(e, LookupError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
similar_soft_failure_test: bool = False
|
||||
|
||||
for encoding_soft_failed in tested_but_soft_failure:
|
||||
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
||||
similar_soft_failure_test = True
|
||||
break
|
||||
|
||||
if similar_soft_failure_test:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
|
||||
encoding_iana,
|
||||
encoding_soft_failed,
|
||||
)
|
||||
continue
|
||||
|
||||
r_ = range(
|
||||
0 if not bom_or_sig_available else len(sig_payload),
|
||||
length,
|
||||
int(length / steps),
|
||||
)
|
||||
|
||||
multi_byte_bonus: bool = (
|
||||
is_multi_byte_decoder
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length
|
||||
)
|
||||
|
||||
if multi_byte_bonus:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
||||
"was encoded using n-bytes.",
|
||||
encoding_iana,
|
||||
)
|
||||
|
||||
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||
|
||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||
early_stop_count: int = 0
|
||||
lazy_str_hard_failure = False
|
||||
|
||||
md_chunks: list[str] = []
|
||||
md_ratios = []
|
||||
|
||||
try:
|
||||
for chunk in cut_sequence_chunks(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
r_,
|
||||
chunk_size,
|
||||
bom_or_sig_available,
|
||||
strip_sig_or_bom,
|
||||
sig_payload,
|
||||
is_multi_byte_decoder,
|
||||
decoded_payload,
|
||||
):
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(
|
||||
mess_ratio(
|
||||
chunk,
|
||||
threshold,
|
||||
explain is True and 1 <= len(cp_isolation) <= 2,
|
||||
)
|
||||
)
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
):
|
||||
break
|
||||
except (
|
||||
UnicodeDecodeError
|
||||
) as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
|
||||
# We might want to check the sequence again with the whole content
|
||||
# Only if initial MD tests passes
|
||||
if (
|
||||
not lazy_str_hard_failure
|
||||
and is_too_large_sequence
|
||||
and not is_multi_byte_decoder
|
||||
):
|
||||
try:
|
||||
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
||||
except UnicodeDecodeError as e:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
||||
"Computed mean chaos is %f %%.",
|
||||
encoding_iana,
|
||||
early_stop_count,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
# Preparing those fallbacks in case we got nothing.
|
||||
if (
|
||||
enable_fallback
|
||||
and encoding_iana in ["ascii", "utf_8", specified_encoding]
|
||||
and not lazy_str_hard_failure
|
||||
):
|
||||
fallback_entry = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
threshold,
|
||||
False,
|
||||
[],
|
||||
decoded_payload,
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
if encoding_iana == specified_encoding:
|
||||
fallback_specified = fallback_entry
|
||||
elif encoding_iana == "ascii":
|
||||
fallback_ascii = fallback_entry
|
||||
else:
|
||||
fallback_u8 = fallback_entry
|
||||
continue
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
||||
encoding_iana,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
|
||||
if not is_multi_byte_decoder:
|
||||
target_languages: list[str] = encoding_languages(encoding_iana)
|
||||
else:
|
||||
target_languages = mb_encoding_languages(encoding_iana)
|
||||
|
||||
if target_languages:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"{} should target any language(s) of {}".format(
|
||||
encoding_iana, str(target_languages)
|
||||
),
|
||||
)
|
||||
|
||||
cd_ratios = []
|
||||
|
||||
# We shall skip the CD when its about ASCII
|
||||
# Most of the time its not relevant to run "language-detection" on it.
|
||||
if encoding_iana != "ascii":
|
||||
for chunk in md_chunks:
|
||||
chunk_languages = coherence_ratio(
|
||||
chunk,
|
||||
language_threshold,
|
||||
",".join(target_languages) if target_languages else None,
|
||||
)
|
||||
|
||||
cd_ratios.append(chunk_languages)
|
||||
|
||||
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
||||
|
||||
if cd_ratios_merged:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"We detected language {} using {}".format(
|
||||
cd_ratios_merged, encoding_iana
|
||||
),
|
||||
)
|
||||
|
||||
current_match = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
bom_or_sig_available,
|
||||
cd_ratios_merged,
|
||||
(
|
||||
decoded_payload
|
||||
if (
|
||||
is_too_large_sequence is False
|
||||
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
)
|
||||
else None
|
||||
),
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
|
||||
results.append(current_match)
|
||||
|
||||
if (
|
||||
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
and mean_mess_ratio < 0.1
|
||||
):
|
||||
# If md says nothing to worry about, then... stop immediately!
|
||||
if mean_mess_ratio == 0.0:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
current_match.encoding,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([current_match])
|
||||
|
||||
early_stop_results.append(current_match)
|
||||
|
||||
if (
|
||||
len(early_stop_results)
|
||||
and (specified_encoding is None or specified_encoding in tested)
|
||||
and "ascii" in tested
|
||||
and "utf_8" in tested
|
||||
):
|
||||
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
probable_result.encoding,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return CharsetMatches([probable_result])
|
||||
|
||||
if encoding_iana == sig_encoding:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
||||
"the beginning of the sequence.",
|
||||
encoding_iana,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([results[encoding_iana]])
|
||||
|
||||
if len(results) == 0:
|
||||
if fallback_u8 or fallback_ascii or fallback_specified:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
||||
)
|
||||
|
||||
if fallback_specified:
|
||||
logger.debug(
|
||||
"Encoding detection: %s will be used as a fallback match",
|
||||
fallback_specified.encoding,
|
||||
)
|
||||
results.append(fallback_specified)
|
||||
elif (
|
||||
(fallback_u8 and fallback_ascii is None)
|
||||
or (
|
||||
fallback_u8
|
||||
and fallback_ascii
|
||||
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
||||
)
|
||||
or (fallback_u8 is not None)
|
||||
):
|
||||
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
||||
results.append(fallback_u8)
|
||||
elif fallback_ascii:
|
||||
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
||||
results.append(fallback_ascii)
|
||||
|
||||
if results:
|
||||
logger.debug(
|
||||
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
||||
results.best().encoding, # type: ignore
|
||||
len(results) - 1,
|
||||
)
|
||||
else:
|
||||
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
||||
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def from_fp(
|
||||
fp: BinaryIO,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same thing than the function from_bytes but using a file pointer that is already ready.
|
||||
Will not close the file pointer.
|
||||
"""
|
||||
return from_bytes(
|
||||
fp.read(),
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
language_threshold,
|
||||
enable_fallback,
|
||||
)
|
||||
|
||||
|
||||
def from_path(
|
||||
path: str | bytes | PathLike, # type: ignore[type-arg]
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
|
||||
Can raise IOError.
|
||||
"""
|
||||
with open(path, "rb") as fp:
|
||||
return from_fp(
|
||||
fp,
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
language_threshold,
|
||||
enable_fallback,
|
||||
)
|
||||
|
||||
|
||||
def is_binary(
|
||||
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
|
||||
Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
|
||||
are disabled to be stricter around ASCII-compatible but unlikely to be a string.
|
||||
"""
|
||||
if isinstance(fp_or_path_or_payload, (str, PathLike)):
|
||||
guesses = from_path(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
elif isinstance(
|
||||
fp_or_path_or_payload,
|
||||
(
|
||||
bytes,
|
||||
bytearray,
|
||||
),
|
||||
):
|
||||
guesses = from_bytes(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
else:
|
||||
guesses = from_fp(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
|
||||
return not guesses
|
395
.venv/lib/python3.12/site-packages/charset_normalizer/cd.py
Normal file
395
.venv/lib/python3.12/site-packages/charset_normalizer/cd.py
Normal file
|
@ -0,0 +1,395 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Counter as TypeCounter
|
||||
|
||||
from .constant import (
|
||||
FREQUENCIES,
|
||||
KO_NAMES,
|
||||
LANGUAGE_SUPPORTED_COUNT,
|
||||
TOO_SMALL_SEQUENCE,
|
||||
ZH_NAMES,
|
||||
)
|
||||
from .md import is_suspiciously_successive_range
|
||||
from .models import CoherenceMatches
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_latin,
|
||||
is_multi_byte_encoding,
|
||||
is_unicode_range_secondary,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
def encoding_unicode_range(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Return associated unicode ranges in a single byte code page.
|
||||
"""
|
||||
if is_multi_byte_encoding(iana_name):
|
||||
raise OSError("Function not supported on multi-byte code page")
|
||||
|
||||
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
|
||||
|
||||
p: IncrementalDecoder = decoder(errors="ignore")
|
||||
seen_ranges: dict[str, int] = {}
|
||||
character_count: int = 0
|
||||
|
||||
for i in range(0x40, 0xFF):
|
||||
chunk: str = p.decode(bytes([i]))
|
||||
|
||||
if chunk:
|
||||
character_range: str | None = unicode_range(chunk)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
if is_unicode_range_secondary(character_range) is False:
|
||||
if character_range not in seen_ranges:
|
||||
seen_ranges[character_range] = 0
|
||||
seen_ranges[character_range] += 1
|
||||
character_count += 1
|
||||
|
||||
return sorted(
|
||||
[
|
||||
character_range
|
||||
for character_range in seen_ranges
|
||||
if seen_ranges[character_range] / character_count >= 0.15
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def unicode_range_languages(primary_range: str) -> list[str]:
|
||||
"""
|
||||
Return inferred languages used with a unicode range.
|
||||
"""
|
||||
languages: list[str] = []
|
||||
|
||||
for language, characters in FREQUENCIES.items():
|
||||
for character in characters:
|
||||
if unicode_range(character) == primary_range:
|
||||
languages.append(language)
|
||||
break
|
||||
|
||||
return languages
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def encoding_languages(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
|
||||
primary_range: str | None = None
|
||||
|
||||
for specified_range in unicode_ranges:
|
||||
if "Latin" not in specified_range:
|
||||
primary_range = specified_range
|
||||
break
|
||||
|
||||
if primary_range is None:
|
||||
return ["Latin Based"]
|
||||
|
||||
return unicode_range_languages(primary_range)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def mb_encoding_languages(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
if (
|
||||
iana_name.startswith("shift_")
|
||||
or iana_name.startswith("iso2022_jp")
|
||||
or iana_name.startswith("euc_j")
|
||||
or iana_name == "cp932"
|
||||
):
|
||||
return ["Japanese"]
|
||||
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
||||
return ["Chinese"]
|
||||
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
||||
return ["Korean"]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
||||
def get_target_features(language: str) -> tuple[bool, bool]:
|
||||
"""
|
||||
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
||||
"""
|
||||
target_have_accents: bool = False
|
||||
target_pure_latin: bool = True
|
||||
|
||||
for character in FREQUENCIES[language]:
|
||||
if not target_have_accents and is_accentuated(character):
|
||||
target_have_accents = True
|
||||
if target_pure_latin and is_latin(character) is False:
|
||||
target_pure_latin = False
|
||||
|
||||
return target_have_accents, target_pure_latin
|
||||
|
||||
|
||||
def alphabet_languages(
|
||||
characters: list[str], ignore_non_latin: bool = False
|
||||
) -> list[str]:
|
||||
"""
|
||||
Return associated languages associated to given characters.
|
||||
"""
|
||||
languages: list[tuple[str, float]] = []
|
||||
|
||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||
|
||||
for language, language_characters in FREQUENCIES.items():
|
||||
target_have_accents, target_pure_latin = get_target_features(language)
|
||||
|
||||
if ignore_non_latin and target_pure_latin is False:
|
||||
continue
|
||||
|
||||
if target_have_accents is False and source_have_accents:
|
||||
continue
|
||||
|
||||
character_count: int = len(language_characters)
|
||||
|
||||
character_match_count: int = len(
|
||||
[c for c in language_characters if c in characters]
|
||||
)
|
||||
|
||||
ratio: float = character_match_count / character_count
|
||||
|
||||
if ratio >= 0.2:
|
||||
languages.append((language, ratio))
|
||||
|
||||
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
||||
|
||||
return [compatible_language[0] for compatible_language in languages]
|
||||
|
||||
|
||||
def characters_popularity_compare(
|
||||
language: str, ordered_characters: list[str]
|
||||
) -> float:
|
||||
"""
|
||||
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
|
||||
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
|
||||
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
|
||||
"""
|
||||
if language not in FREQUENCIES:
|
||||
raise ValueError(f"{language} not available")
|
||||
|
||||
character_approved_count: int = 0
|
||||
FREQUENCIES_language_set = set(FREQUENCIES[language])
|
||||
|
||||
ordered_characters_count: int = len(ordered_characters)
|
||||
target_language_characters_count: int = len(FREQUENCIES[language])
|
||||
|
||||
large_alphabet: bool = target_language_characters_count > 26
|
||||
|
||||
for character, character_rank in zip(
|
||||
ordered_characters, range(0, ordered_characters_count)
|
||||
):
|
||||
if character not in FREQUENCIES_language_set:
|
||||
continue
|
||||
|
||||
character_rank_in_language: int = FREQUENCIES[language].index(character)
|
||||
expected_projection_ratio: float = (
|
||||
target_language_characters_count / ordered_characters_count
|
||||
)
|
||||
character_rank_projection: int = int(character_rank * expected_projection_ratio)
|
||||
|
||||
if (
|
||||
large_alphabet is False
|
||||
and abs(character_rank_projection - character_rank_in_language) > 4
|
||||
):
|
||||
continue
|
||||
|
||||
if (
|
||||
large_alphabet is True
|
||||
and abs(character_rank_projection - character_rank_in_language)
|
||||
< target_language_characters_count / 3
|
||||
):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
characters_before_source: list[str] = FREQUENCIES[language][
|
||||
0:character_rank_in_language
|
||||
]
|
||||
characters_after_source: list[str] = FREQUENCIES[language][
|
||||
character_rank_in_language:
|
||||
]
|
||||
characters_before: list[str] = ordered_characters[0:character_rank]
|
||||
characters_after: list[str] = ordered_characters[character_rank:]
|
||||
|
||||
before_match_count: int = len(
|
||||
set(characters_before) & set(characters_before_source)
|
||||
)
|
||||
|
||||
after_match_count: int = len(
|
||||
set(characters_after) & set(characters_after_source)
|
||||
)
|
||||
|
||||
if len(characters_before_source) == 0 and before_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if len(characters_after_source) == 0 and after_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if (
|
||||
before_match_count / len(characters_before_source) >= 0.4
|
||||
or after_match_count / len(characters_after_source) >= 0.4
|
||||
):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
return character_approved_count / len(ordered_characters)
|
||||
|
||||
|
||||
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
|
||||
"""
|
||||
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
|
||||
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
||||
One containing the latin letters and the other hebrew.
|
||||
"""
|
||||
layers: dict[str, str] = {}
|
||||
|
||||
for character in decoded_sequence:
|
||||
if character.isalpha() is False:
|
||||
continue
|
||||
|
||||
character_range: str | None = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
layer_target_range: str | None = None
|
||||
|
||||
for discovered_range in layers:
|
||||
if (
|
||||
is_suspiciously_successive_range(discovered_range, character_range)
|
||||
is False
|
||||
):
|
||||
layer_target_range = discovered_range
|
||||
break
|
||||
|
||||
if layer_target_range is None:
|
||||
layer_target_range = character_range
|
||||
|
||||
if layer_target_range not in layers:
|
||||
layers[layer_target_range] = character.lower()
|
||||
continue
|
||||
|
||||
layers[layer_target_range] += character.lower()
|
||||
|
||||
return list(layers.values())
|
||||
|
||||
|
||||
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
|
||||
"""
|
||||
This function merge results previously given by the function coherence_ratio.
|
||||
The return type is the same as coherence_ratio.
|
||||
"""
|
||||
per_language_ratios: dict[str, list[float]] = {}
|
||||
for result in results:
|
||||
for sub_result in result:
|
||||
language, ratio = sub_result
|
||||
if language not in per_language_ratios:
|
||||
per_language_ratios[language] = [ratio]
|
||||
continue
|
||||
per_language_ratios[language].append(ratio)
|
||||
|
||||
merge = [
|
||||
(
|
||||
language,
|
||||
round(
|
||||
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
|
||||
4,
|
||||
),
|
||||
)
|
||||
for language in per_language_ratios
|
||||
]
|
||||
|
||||
return sorted(merge, key=lambda x: x[1], reverse=True)
|
||||
|
||||
|
||||
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
|
||||
"""
|
||||
We shall NOT return "English—" in CoherenceMatches because it is an alternative
|
||||
of "English". This function only keeps the best match and remove the em-dash in it.
|
||||
"""
|
||||
index_results: dict[str, list[float]] = dict()
|
||||
|
||||
for result in results:
|
||||
language, ratio = result
|
||||
no_em_name: str = language.replace("—", "")
|
||||
|
||||
if no_em_name not in index_results:
|
||||
index_results[no_em_name] = []
|
||||
|
||||
index_results[no_em_name].append(ratio)
|
||||
|
||||
if any(len(index_results[e]) > 1 for e in index_results):
|
||||
filtered_results: CoherenceMatches = []
|
||||
|
||||
for language in index_results:
|
||||
filtered_results.append((language, max(index_results[language])))
|
||||
|
||||
return filtered_results
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def coherence_ratio(
|
||||
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
|
||||
) -> CoherenceMatches:
|
||||
"""
|
||||
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
|
||||
A layer = Character extraction by alphabets/ranges.
|
||||
"""
|
||||
|
||||
results: list[tuple[str, float]] = []
|
||||
ignore_non_latin: bool = False
|
||||
|
||||
sufficient_match_count: int = 0
|
||||
|
||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||
if "Latin Based" in lg_inclusion_list:
|
||||
ignore_non_latin = True
|
||||
lg_inclusion_list.remove("Latin Based")
|
||||
|
||||
for layer in alpha_unicode_split(decoded_sequence):
|
||||
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||
most_common = sequence_frequencies.most_common()
|
||||
|
||||
character_count: int = sum(o for c, o in most_common)
|
||||
|
||||
if character_count <= TOO_SMALL_SEQUENCE:
|
||||
continue
|
||||
|
||||
popular_character_ordered: list[str] = [c for c, o in most_common]
|
||||
|
||||
for language in lg_inclusion_list or alphabet_languages(
|
||||
popular_character_ordered, ignore_non_latin
|
||||
):
|
||||
ratio: float = characters_popularity_compare(
|
||||
language, popular_character_ordered
|
||||
)
|
||||
|
||||
if ratio < threshold:
|
||||
continue
|
||||
elif ratio >= 0.8:
|
||||
sufficient_match_count += 1
|
||||
|
||||
results.append((language, round(ratio, 4)))
|
||||
|
||||
if sufficient_match_count >= 3:
|
||||
break
|
||||
|
||||
return sorted(
|
||||
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
|
||||
)
|
|
@ -0,0 +1,8 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from .__main__ import cli_detect, query_yes_no
|
||||
|
||||
__all__ = (
|
||||
"cli_detect",
|
||||
"query_yes_no",
|
||||
)
|
|
@ -0,0 +1,381 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import typing
|
||||
from json import dumps
|
||||
from os.path import abspath, basename, dirname, join, realpath
|
||||
from platform import python_version
|
||||
from unicodedata import unidata_version
|
||||
|
||||
import charset_normalizer.md as md_module
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: str = "yes") -> bool:
|
||||
"""Ask a yes/no question via input() and return their answer.
|
||||
|
||||
"question" is a string that is presented to the user.
|
||||
"default" is the presumed answer if the user just hits <Enter>.
|
||||
It must be "yes" (the default), "no" or None (meaning
|
||||
an answer is required of the user).
|
||||
|
||||
The "answer" return value is True for "yes" or False for "no".
|
||||
|
||||
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
|
||||
"""
|
||||
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
|
||||
if default is None:
|
||||
prompt = " [y/n] "
|
||||
elif default == "yes":
|
||||
prompt = " [Y/n] "
|
||||
elif default == "no":
|
||||
prompt = " [y/N] "
|
||||
else:
|
||||
raise ValueError("invalid default answer: '%s'" % default)
|
||||
|
||||
while True:
|
||||
sys.stdout.write(question + prompt)
|
||||
choice = input().lower()
|
||||
if default is not None and choice == "":
|
||||
return valid[default]
|
||||
elif choice in valid:
|
||||
return valid[choice]
|
||||
else:
|
||||
sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
||||
|
||||
|
||||
class FileType:
|
||||
"""Factory for creating file object types
|
||||
|
||||
Instances of FileType are typically passed as type= arguments to the
|
||||
ArgumentParser add_argument() method.
|
||||
|
||||
Keyword Arguments:
|
||||
- mode -- A string indicating how the file is to be opened. Accepts the
|
||||
same values as the builtin open() function.
|
||||
- bufsize -- The file's desired buffer size. Accepts the same values as
|
||||
the builtin open() function.
|
||||
- encoding -- The file's encoding. Accepts the same values as the
|
||||
builtin open() function.
|
||||
- errors -- A string indicating how encoding and decoding errors are to
|
||||
be handled. Accepts the same value as the builtin open() function.
|
||||
|
||||
Backported from CPython 3.12
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "r",
|
||||
bufsize: int = -1,
|
||||
encoding: str | None = None,
|
||||
errors: str | None = None,
|
||||
):
|
||||
self._mode = mode
|
||||
self._bufsize = bufsize
|
||||
self._encoding = encoding
|
||||
self._errors = errors
|
||||
|
||||
def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
|
||||
# the special argument "-" means sys.std{in,out}
|
||||
if string == "-":
|
||||
if "r" in self._mode:
|
||||
return sys.stdin.buffer if "b" in self._mode else sys.stdin
|
||||
elif any(c in self._mode for c in "wax"):
|
||||
return sys.stdout.buffer if "b" in self._mode else sys.stdout
|
||||
else:
|
||||
msg = f'argument "-" with mode {self._mode}'
|
||||
raise ValueError(msg)
|
||||
|
||||
# all other arguments are used as file names
|
||||
try:
|
||||
return open(string, self._mode, self._bufsize, self._encoding, self._errors)
|
||||
except OSError as e:
|
||||
message = f"can't open '{string}': {e}"
|
||||
raise argparse.ArgumentTypeError(message)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
args = self._mode, self._bufsize
|
||||
kwargs = [("encoding", self._encoding), ("errors", self._errors)]
|
||||
args_str = ", ".join(
|
||||
[repr(arg) for arg in args if arg != -1]
|
||||
+ [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
|
||||
)
|
||||
return f"{type(self).__name__}({args_str})"
|
||||
|
||||
|
||||
def cli_detect(argv: list[str] | None = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
:return: 0 if everything is fine, anything else equal trouble
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="The Real First Universal Charset Detector. "
|
||||
"Discover originating encoding used on text file. "
|
||||
"Normalize text to unicode."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="verbose",
|
||||
help="Display complementary information about file if any. "
|
||||
"Stdout will contain logs about the detection process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--with-alternative",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="alternatives",
|
||||
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--normalize",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="normalize",
|
||||
help="Permit to normalize input file. If not set, program does not write anything.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--minimal",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="minimal",
|
||||
help="Only output the charset detected to STDOUT. Disabling JSON output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--replace",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="replace",
|
||||
help="Replace file when trying to normalize it instead of creating a new one.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--force",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="force",
|
||||
help="Replace file without asking if you are sure, use this flag with caution.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--no-preemptive",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="no_preemptive",
|
||||
help="Disable looking at a charset declaration to hint the detector.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
|
||||
__version__,
|
||||
python_version(),
|
||||
unidata_version,
|
||||
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.replace is True and args.normalize is False:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("Use --replace in addition of --normalize only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.force is True and args.replace is False:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("Use --force in addition of --replace only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.threshold < 0.0 or args.threshold > 1.0:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
x_ = []
|
||||
|
||||
for my_file in args.files:
|
||||
matches = from_fp(
|
||||
my_file,
|
||||
threshold=args.threshold,
|
||||
explain=args.verbose,
|
||||
preemptive_behaviour=args.no_preemptive is False,
|
||||
)
|
||||
|
||||
best_guess = matches.best()
|
||||
|
||||
if best_guess is None:
|
||||
print(
|
||||
'Unable to identify originating encoding for "{}". {}'.format(
|
||||
my_file.name,
|
||||
(
|
||||
"Maybe try increasing maximum amount of chaos."
|
||||
if args.threshold < 1.0
|
||||
else ""
|
||||
),
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
None,
|
||||
[],
|
||||
[],
|
||||
"Unknown",
|
||||
[],
|
||||
False,
|
||||
1.0,
|
||||
0.0,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
else:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
best_guess.encoding,
|
||||
best_guess.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in best_guess.could_be_from_charset
|
||||
if cp != best_guess.encoding
|
||||
],
|
||||
best_guess.language,
|
||||
best_guess.alphabets,
|
||||
best_guess.bom,
|
||||
best_guess.percent_chaos,
|
||||
best_guess.percent_coherence,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
|
||||
if len(matches) > 1 and args.alternatives:
|
||||
for el in matches:
|
||||
if el != best_guess:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
el.encoding,
|
||||
el.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in el.could_be_from_charset
|
||||
if cp != el.encoding
|
||||
],
|
||||
el.language,
|
||||
el.alphabets,
|
||||
el.bom,
|
||||
el.percent_chaos,
|
||||
el.percent_coherence,
|
||||
None,
|
||||
False,
|
||||
)
|
||||
)
|
||||
|
||||
if args.normalize is True:
|
||||
if best_guess.encoding.startswith("utf") is True:
|
||||
print(
|
||||
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
|
||||
my_file.name
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
dir_path = dirname(realpath(my_file.name))
|
||||
file_name = basename(realpath(my_file.name))
|
||||
|
||||
o_: list[str] = file_name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
elif (
|
||||
args.force is False
|
||||
and query_yes_no(
|
||||
'Are you sure to normalize "{}" by replacing it ?'.format(
|
||||
my_file.name
|
||||
),
|
||||
"no",
|
||||
)
|
||||
is False
|
||||
):
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
try:
|
||||
x_[0].unicode_path = join(dir_path, ".".join(o_))
|
||||
|
||||
with open(x_[0].unicode_path, "wb") as fp:
|
||||
fp.write(best_guess.output())
|
||||
except OSError as e:
|
||||
print(str(e), file=sys.stderr)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
return 2
|
||||
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
|
||||
if args.minimal is False:
|
||||
print(
|
||||
dumps(
|
||||
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
|
||||
ensure_ascii=True,
|
||||
indent=4,
|
||||
)
|
||||
)
|
||||
else:
|
||||
for my_file in args.files:
|
||||
print(
|
||||
", ".join(
|
||||
[
|
||||
el.encoding or "undefined"
|
||||
for el in x_
|
||||
if el.path == abspath(my_file.name)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
Binary file not shown.
Binary file not shown.
2015
.venv/lib/python3.12/site-packages/charset_normalizer/constant.py
Normal file
2015
.venv/lib/python3.12/site-packages/charset_normalizer/constant.py
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,64 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from warnings import warn
|
||||
|
||||
from .api import from_bytes
|
||||
from .constant import CHARDET_CORRESPONDENCE
|
||||
|
||||
# TODO: remove this check when dropping Python 3.7 support
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
class ResultDict(TypedDict):
|
||||
encoding: str | None
|
||||
language: str
|
||||
confidence: float | None
|
||||
|
||||
|
||||
def detect(
|
||||
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
|
||||
) -> ResultDict:
|
||||
"""
|
||||
chardet legacy method
|
||||
Detect the encoding of the given byte string. It should be mostly backward-compatible.
|
||||
Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
|
||||
This function is deprecated and should be used to migrate your project easily, consult the documentation for
|
||||
further information. Not planned for removal.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
:param should_rename_legacy: Should we rename legacy encodings
|
||||
to their more modern equivalents?
|
||||
"""
|
||||
if len(kwargs):
|
||||
warn(
|
||||
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
|
||||
)
|
||||
|
||||
if not isinstance(byte_str, (bytearray, bytes)):
|
||||
raise TypeError( # pragma: nocover
|
||||
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
||||
)
|
||||
|
||||
if isinstance(byte_str, bytearray):
|
||||
byte_str = bytes(byte_str)
|
||||
|
||||
r = from_bytes(byte_str).best()
|
||||
|
||||
encoding = r.encoding if r is not None else None
|
||||
language = r.language if r is not None and r.language != "Unknown" else ""
|
||||
confidence = 1.0 - r.chaos if r is not None else None
|
||||
|
||||
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
|
||||
# but chardet does return 'utf-8-sig' and it is a valid codec name.
|
||||
if r is not None and encoding == "utf_8" and r.bom:
|
||||
encoding += "_sig"
|
||||
|
||||
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
|
||||
encoding = CHARDET_CORRESPONDENCE[encoding]
|
||||
|
||||
return {
|
||||
"encoding": encoding,
|
||||
"language": language,
|
||||
"confidence": confidence,
|
||||
}
|
Binary file not shown.
635
.venv/lib/python3.12/site-packages/charset_normalizer/md.py
Normal file
635
.venv/lib/python3.12/site-packages/charset_normalizer/md.py
Normal file
|
@ -0,0 +1,635 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from logging import getLogger
|
||||
|
||||
from .constant import (
|
||||
COMMON_SAFE_ASCII_CHARACTERS,
|
||||
TRACE,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
)
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_arabic,
|
||||
is_arabic_isolated_form,
|
||||
is_case_variable,
|
||||
is_cjk,
|
||||
is_emoticon,
|
||||
is_hangul,
|
||||
is_hiragana,
|
||||
is_katakana,
|
||||
is_latin,
|
||||
is_punctuation,
|
||||
is_separator,
|
||||
is_symbol,
|
||||
is_thai,
|
||||
is_unprintable,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
is_cjk_uncommon,
|
||||
)
|
||||
|
||||
|
||||
class MessDetectorPlugin:
|
||||
"""
|
||||
Base abstract class used for mess detection plugins.
|
||||
All detectors MUST extend and implement given methods.
|
||||
"""
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
"""
|
||||
Determine if given character should be fed in.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
"""
|
||||
The main routine to be executed upon character.
|
||||
Insert the logic in witch the text would be considered chaotic.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
"""
|
||||
Permit to reset the plugin to the initial state.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
"""
|
||||
Compute the chaos ratio based on what your feed() has seen.
|
||||
Must NOT be lower than 0.; No restriction gt 0.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._punctuation_count: int = 0
|
||||
self._symbol_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_printable_char: str | None = None
|
||||
self._frenzy_symbol_in_word: bool = False
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character != self._last_printable_char
|
||||
and character not in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
if is_punctuation(character):
|
||||
self._punctuation_count += 1
|
||||
elif (
|
||||
character.isdigit() is False
|
||||
and is_symbol(character)
|
||||
and is_emoticon(character) is False
|
||||
):
|
||||
self._symbol_count += 2
|
||||
|
||||
self._last_printable_char = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._punctuation_count = 0
|
||||
self._character_count = 0
|
||||
self._symbol_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_punctuation: float = (
|
||||
self._punctuation_count + self._symbol_count
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._accentuated_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if is_accentuated(character):
|
||||
self._accentuated_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._accentuated_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
||||
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
||||
|
||||
|
||||
class UnprintablePlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._unprintable_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if is_unprintable(character):
|
||||
self._unprintable_count += 1
|
||||
self._character_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._unprintable_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._unprintable_count * 8) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._successive_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_latin_character: str | None = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha() and is_latin(character)
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
if (
|
||||
self._last_latin_character is not None
|
||||
and is_accentuated(character)
|
||||
and is_accentuated(self._last_latin_character)
|
||||
):
|
||||
if character.isupper() and self._last_latin_character.isupper():
|
||||
self._successive_count += 1
|
||||
# Worse if its the same char duplicated with different accent.
|
||||
if remove_accent(character) == remove_accent(self._last_latin_character):
|
||||
self._successive_count += 1
|
||||
self._last_latin_character = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._successive_count = 0
|
||||
self._character_count = 0
|
||||
self._last_latin_character = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._successive_count * 2) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousRange(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._suspicious_successive_range_count: int = 0
|
||||
self._character_count: int = 0
|
||||
self._last_printable_seen: str | None = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character.isspace()
|
||||
or is_punctuation(character)
|
||||
or character in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
self._last_printable_seen = None
|
||||
return
|
||||
|
||||
if self._last_printable_seen is None:
|
||||
self._last_printable_seen = character
|
||||
return
|
||||
|
||||
unicode_range_a: str | None = unicode_range(self._last_printable_seen)
|
||||
unicode_range_b: str | None = unicode_range(character)
|
||||
|
||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||
self._suspicious_successive_range_count += 1
|
||||
|
||||
self._last_printable_seen = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._suspicious_successive_range_count = 0
|
||||
self._last_printable_seen = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count <= 13:
|
||||
return 0.0
|
||||
|
||||
ratio_of_suspicious_range_usage: float = (
|
||||
self._suspicious_successive_range_count * 2
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_suspicious_range_usage
|
||||
|
||||
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._word_count: int = 0
|
||||
self._bad_word_count: int = 0
|
||||
self._foreign_long_count: int = 0
|
||||
|
||||
self._is_current_word_bad: bool = False
|
||||
self._foreign_long_watch: bool = False
|
||||
|
||||
self._character_count: int = 0
|
||||
self._bad_character_count: int = 0
|
||||
|
||||
self._buffer: str = ""
|
||||
self._buffer_accent_count: int = 0
|
||||
self._buffer_glyph_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if character.isalpha():
|
||||
self._buffer += character
|
||||
if is_accentuated(character):
|
||||
self._buffer_accent_count += 1
|
||||
if (
|
||||
self._foreign_long_watch is False
|
||||
and (is_latin(character) is False or is_accentuated(character))
|
||||
and is_cjk(character) is False
|
||||
and is_hangul(character) is False
|
||||
and is_katakana(character) is False
|
||||
and is_hiragana(character) is False
|
||||
and is_thai(character) is False
|
||||
):
|
||||
self._foreign_long_watch = True
|
||||
if (
|
||||
is_cjk(character)
|
||||
or is_hangul(character)
|
||||
or is_katakana(character)
|
||||
or is_hiragana(character)
|
||||
or is_thai(character)
|
||||
):
|
||||
self._buffer_glyph_count += 1
|
||||
return
|
||||
if not self._buffer:
|
||||
return
|
||||
if (
|
||||
character.isspace() or is_punctuation(character) or is_separator(character)
|
||||
) and self._buffer:
|
||||
self._word_count += 1
|
||||
buffer_length: int = len(self._buffer)
|
||||
|
||||
self._character_count += buffer_length
|
||||
|
||||
if buffer_length >= 4:
|
||||
if self._buffer_accent_count / buffer_length >= 0.5:
|
||||
self._is_current_word_bad = True
|
||||
# Word/Buffer ending with an upper case accentuated letter are so rare,
|
||||
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
|
||||
elif (
|
||||
is_accentuated(self._buffer[-1])
|
||||
and self._buffer[-1].isupper()
|
||||
and all(_.isupper() for _ in self._buffer) is False
|
||||
):
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
elif self._buffer_glyph_count == 1:
|
||||
self._is_current_word_bad = True
|
||||
self._foreign_long_count += 1
|
||||
if buffer_length >= 24 and self._foreign_long_watch:
|
||||
camel_case_dst = [
|
||||
i
|
||||
for c, i in zip(self._buffer, range(0, buffer_length))
|
||||
if c.isupper()
|
||||
]
|
||||
probable_camel_cased: bool = False
|
||||
|
||||
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
|
||||
probable_camel_cased = True
|
||||
|
||||
if not probable_camel_cased:
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
|
||||
if self._is_current_word_bad:
|
||||
self._bad_word_count += 1
|
||||
self._bad_character_count += len(self._buffer)
|
||||
self._is_current_word_bad = False
|
||||
|
||||
self._foreign_long_watch = False
|
||||
self._buffer = ""
|
||||
self._buffer_accent_count = 0
|
||||
self._buffer_glyph_count = 0
|
||||
elif (
|
||||
character not in {"<", ">", "-", "=", "~", "|", "_"}
|
||||
and character.isdigit() is False
|
||||
and is_symbol(character)
|
||||
):
|
||||
self._is_current_word_bad = True
|
||||
self._buffer += character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._buffer = ""
|
||||
self._is_current_word_bad = False
|
||||
self._foreign_long_watch = False
|
||||
self._bad_word_count = 0
|
||||
self._word_count = 0
|
||||
self._character_count = 0
|
||||
self._bad_character_count = 0
|
||||
self._foreign_long_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._word_count <= 10 and self._foreign_long_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._bad_character_count / self._character_count
|
||||
|
||||
|
||||
class CjkUncommonPlugin(MessDetectorPlugin):
|
||||
"""
|
||||
Detect messy CJK text that probably means nothing.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._uncommon_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return is_cjk(character)
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if is_cjk_uncommon(character):
|
||||
self._uncommon_count += 1
|
||||
return
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._uncommon_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
uncommon_form_usage: float = self._uncommon_count / self._character_count
|
||||
|
||||
# we can be pretty sure it's garbage when uncommon characters are widely
|
||||
# used. otherwise it could just be traditional chinese for example.
|
||||
return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
|
||||
|
||||
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._buf: bool = False
|
||||
|
||||
self._character_count_since_last_sep: int = 0
|
||||
|
||||
self._successive_upper_lower_count: int = 0
|
||||
self._successive_upper_lower_count_final: int = 0
|
||||
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_alpha_seen: str | None = None
|
||||
self._current_ascii_only: bool = True
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
is_concerned = character.isalpha() and is_case_variable(character)
|
||||
chunk_sep = is_concerned is False
|
||||
|
||||
if chunk_sep and self._character_count_since_last_sep > 0:
|
||||
if (
|
||||
self._character_count_since_last_sep <= 64
|
||||
and character.isdigit() is False
|
||||
and self._current_ascii_only is False
|
||||
):
|
||||
self._successive_upper_lower_count_final += (
|
||||
self._successive_upper_lower_count
|
||||
)
|
||||
|
||||
self._successive_upper_lower_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._character_count += 1
|
||||
self._current_ascii_only = True
|
||||
|
||||
return
|
||||
|
||||
if self._current_ascii_only is True and character.isascii() is False:
|
||||
self._current_ascii_only = False
|
||||
|
||||
if self._last_alpha_seen is not None:
|
||||
if (character.isupper() and self._last_alpha_seen.islower()) or (
|
||||
character.islower() and self._last_alpha_seen.isupper()
|
||||
):
|
||||
if self._buf is True:
|
||||
self._successive_upper_lower_count += 2
|
||||
self._buf = False
|
||||
else:
|
||||
self._buf = True
|
||||
else:
|
||||
self._buf = False
|
||||
|
||||
self._character_count += 1
|
||||
self._character_count_since_last_sep += 1
|
||||
self._last_alpha_seen = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._successive_upper_lower_count = 0
|
||||
self._successive_upper_lower_count_final = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._current_ascii_only = True
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._successive_upper_lower_count_final / self._character_count
|
||||
|
||||
|
||||
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._isolated_form_count: int = 0
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._isolated_form_count = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return is_arabic(character)
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if is_arabic_isolated_form(character):
|
||||
self._isolated_form_count += 1
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
isolated_form_usage: float = self._isolated_form_count / self._character_count
|
||||
|
||||
return isolated_form_usage
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def is_suspiciously_successive_range(
|
||||
unicode_range_a: str | None, unicode_range_b: str | None
|
||||
) -> bool:
|
||||
"""
|
||||
Determine if two Unicode range seen next to each other can be considered as suspicious.
|
||||
"""
|
||||
if unicode_range_a is None or unicode_range_b is None:
|
||||
return True
|
||||
|
||||
if unicode_range_a == unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
|
||||
return False
|
||||
|
||||
# Latin characters can be accompanied with a combining diacritical mark
|
||||
# eg. Vietnamese.
|
||||
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
|
||||
"Combining" in unicode_range_a or "Combining" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
|
||||
keywords_range_a, keywords_range_b = (
|
||||
unicode_range_a.split(" "),
|
||||
unicode_range_b.split(" "),
|
||||
)
|
||||
|
||||
for el in keywords_range_a:
|
||||
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
|
||||
continue
|
||||
if el in keywords_range_b:
|
||||
return False
|
||||
|
||||
# Japanese Exception
|
||||
range_a_jp_chars, range_b_jp_chars = (
|
||||
unicode_range_a
|
||||
in (
|
||||
"Hiragana",
|
||||
"Katakana",
|
||||
),
|
||||
unicode_range_b in ("Hiragana", "Katakana"),
|
||||
)
|
||||
if (range_a_jp_chars or range_b_jp_chars) and (
|
||||
"CJK" in unicode_range_a or "CJK" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
if range_a_jp_chars and range_b_jp_chars:
|
||||
return False
|
||||
|
||||
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
|
||||
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
# Chinese/Japanese use dedicated range for punctuation and/or separators.
|
||||
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
|
||||
unicode_range_a in ["Katakana", "Hiragana"]
|
||||
and unicode_range_b in ["Katakana", "Hiragana"]
|
||||
):
|
||||
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
|
||||
return False
|
||||
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def mess_ratio(
|
||||
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
|
||||
) -> float:
|
||||
"""
|
||||
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
|
||||
"""
|
||||
|
||||
detectors: list[MessDetectorPlugin] = [
|
||||
md_class() for md_class in MessDetectorPlugin.__subclasses__()
|
||||
]
|
||||
|
||||
length: int = len(decoded_sequence) + 1
|
||||
|
||||
mean_mess_ratio: float = 0.0
|
||||
|
||||
if length < 512:
|
||||
intermediary_mean_mess_ratio_calc: int = 32
|
||||
elif length <= 1024:
|
||||
intermediary_mean_mess_ratio_calc = 64
|
||||
else:
|
||||
intermediary_mean_mess_ratio_calc = 128
|
||||
|
||||
for character, index in zip(decoded_sequence + "\n", range(length)):
|
||||
for detector in detectors:
|
||||
if detector.eligible(character):
|
||||
detector.feed(character)
|
||||
|
||||
if (
|
||||
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
|
||||
) or index == length - 1:
|
||||
mean_mess_ratio = sum(dt.ratio for dt in detectors)
|
||||
|
||||
if mean_mess_ratio >= maximum_threshold:
|
||||
break
|
||||
|
||||
if debug:
|
||||
logger = getLogger("charset_normalizer")
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Mess-detector extended-analysis start. "
|
||||
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
|
||||
f"maximum_threshold={maximum_threshold}",
|
||||
)
|
||||
|
||||
if len(decoded_sequence) > 16:
|
||||
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
|
||||
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
|
||||
|
||||
for dt in detectors:
|
||||
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
|
||||
|
||||
return round(mean_mess_ratio, 3)
|
Binary file not shown.
360
.venv/lib/python3.12/site-packages/charset_normalizer/models.py
Normal file
360
.venv/lib/python3.12/site-packages/charset_normalizer/models.py
Normal file
|
@ -0,0 +1,360 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from encodings.aliases import aliases
|
||||
from hashlib import sha256
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import Any, Iterator, List, Tuple
|
||||
|
||||
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
|
||||
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
||||
|
||||
|
||||
class CharsetMatch:
|
||||
def __init__(
|
||||
self,
|
||||
payload: bytes,
|
||||
guessed_encoding: str,
|
||||
mean_mess_ratio: float,
|
||||
has_sig_or_bom: bool,
|
||||
languages: CoherenceMatches,
|
||||
decoded_payload: str | None = None,
|
||||
preemptive_declaration: str | None = None,
|
||||
):
|
||||
self._payload: bytes = payload
|
||||
|
||||
self._encoding: str = guessed_encoding
|
||||
self._mean_mess_ratio: float = mean_mess_ratio
|
||||
self._languages: CoherenceMatches = languages
|
||||
self._has_sig_or_bom: bool = has_sig_or_bom
|
||||
self._unicode_ranges: list[str] | None = None
|
||||
|
||||
self._leaves: list[CharsetMatch] = []
|
||||
self._mean_coherence_ratio: float = 0.0
|
||||
|
||||
self._output_payload: bytes | None = None
|
||||
self._output_encoding: str | None = None
|
||||
|
||||
self._string: str | None = decoded_payload
|
||||
|
||||
self._preemptive_declaration: str | None = preemptive_declaration
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, CharsetMatch):
|
||||
if isinstance(other, str):
|
||||
return iana_name(other) == self.encoding
|
||||
return False
|
||||
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
|
||||
|
||||
def __lt__(self, other: object) -> bool:
|
||||
"""
|
||||
Implemented to make sorted available upon CharsetMatches items.
|
||||
"""
|
||||
if not isinstance(other, CharsetMatch):
|
||||
raise ValueError
|
||||
|
||||
chaos_difference: float = abs(self.chaos - other.chaos)
|
||||
coherence_difference: float = abs(self.coherence - other.coherence)
|
||||
|
||||
# Below 1% difference --> Use Coherence
|
||||
if chaos_difference < 0.01 and coherence_difference > 0.02:
|
||||
return self.coherence > other.coherence
|
||||
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
|
||||
# When having a difficult decision, use the result that decoded as many multi-byte as possible.
|
||||
# preserve RAM usage!
|
||||
if len(self._payload) >= TOO_BIG_SEQUENCE:
|
||||
return self.chaos < other.chaos
|
||||
return self.multi_byte_usage > other.multi_byte_usage
|
||||
|
||||
return self.chaos < other.chaos
|
||||
|
||||
@property
|
||||
def multi_byte_usage(self) -> float:
|
||||
return 1.0 - (len(str(self)) / len(self.raw))
|
||||
|
||||
def __str__(self) -> str:
|
||||
# Lazy Str Loading
|
||||
if self._string is None:
|
||||
self._string = str(self._payload, self._encoding, "strict")
|
||||
return self._string
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
|
||||
|
||||
def add_submatch(self, other: CharsetMatch) -> None:
|
||||
if not isinstance(other, CharsetMatch) or other == self:
|
||||
raise ValueError(
|
||||
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
|
||||
other.__class__
|
||||
)
|
||||
)
|
||||
|
||||
other._string = None # Unload RAM usage; dirty trick.
|
||||
self._leaves.append(other)
|
||||
|
||||
@property
|
||||
def encoding(self) -> str:
|
||||
return self._encoding
|
||||
|
||||
@property
|
||||
def encoding_aliases(self) -> list[str]:
|
||||
"""
|
||||
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
|
||||
"""
|
||||
also_known_as: list[str] = []
|
||||
for u, p in aliases.items():
|
||||
if self.encoding == u:
|
||||
also_known_as.append(p)
|
||||
elif self.encoding == p:
|
||||
also_known_as.append(u)
|
||||
return also_known_as
|
||||
|
||||
@property
|
||||
def bom(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def byte_order_mark(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def languages(self) -> list[str]:
|
||||
"""
|
||||
Return the complete list of possible languages found in decoded sequence.
|
||||
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
|
||||
"""
|
||||
return [e[0] for e in self._languages]
|
||||
|
||||
@property
|
||||
def language(self) -> str:
|
||||
"""
|
||||
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
|
||||
"Unknown".
|
||||
"""
|
||||
if not self._languages:
|
||||
# Trying to infer the language based on the given encoding
|
||||
# Its either English or we should not pronounce ourselves in certain cases.
|
||||
if "ascii" in self.could_be_from_charset:
|
||||
return "English"
|
||||
|
||||
# doing it there to avoid circular import
|
||||
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
|
||||
|
||||
languages = (
|
||||
mb_encoding_languages(self.encoding)
|
||||
if is_multi_byte_encoding(self.encoding)
|
||||
else encoding_languages(self.encoding)
|
||||
)
|
||||
|
||||
if len(languages) == 0 or "Latin Based" in languages:
|
||||
return "Unknown"
|
||||
|
||||
return languages[0]
|
||||
|
||||
return self._languages[0][0]
|
||||
|
||||
@property
|
||||
def chaos(self) -> float:
|
||||
return self._mean_mess_ratio
|
||||
|
||||
@property
|
||||
def coherence(self) -> float:
|
||||
if not self._languages:
|
||||
return 0.0
|
||||
return self._languages[0][1]
|
||||
|
||||
@property
|
||||
def percent_chaos(self) -> float:
|
||||
return round(self.chaos * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def percent_coherence(self) -> float:
|
||||
return round(self.coherence * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def raw(self) -> bytes:
|
||||
"""
|
||||
Original untouched bytes.
|
||||
"""
|
||||
return self._payload
|
||||
|
||||
@property
|
||||
def submatch(self) -> list[CharsetMatch]:
|
||||
return self._leaves
|
||||
|
||||
@property
|
||||
def has_submatch(self) -> bool:
|
||||
return len(self._leaves) > 0
|
||||
|
||||
@property
|
||||
def alphabets(self) -> list[str]:
|
||||
if self._unicode_ranges is not None:
|
||||
return self._unicode_ranges
|
||||
# list detected ranges
|
||||
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
|
||||
# filter and sort
|
||||
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
||||
return self._unicode_ranges
|
||||
|
||||
@property
|
||||
def could_be_from_charset(self) -> list[str]:
|
||||
"""
|
||||
The complete list of encoding that output the exact SAME str result and therefore could be the originating
|
||||
encoding.
|
||||
This list does include the encoding available in property 'encoding'.
|
||||
"""
|
||||
return [self._encoding] + [m.encoding for m in self._leaves]
|
||||
|
||||
def output(self, encoding: str = "utf_8") -> bytes:
|
||||
"""
|
||||
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
|
||||
Any errors will be simply ignored by the encoder NOT replaced.
|
||||
"""
|
||||
if self._output_encoding is None or self._output_encoding != encoding:
|
||||
self._output_encoding = encoding
|
||||
decoded_string = str(self)
|
||||
if (
|
||||
self._preemptive_declaration is not None
|
||||
and self._preemptive_declaration.lower()
|
||||
not in ["utf-8", "utf8", "utf_8"]
|
||||
):
|
||||
patched_header = sub(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
|
||||
m.groups()[0],
|
||||
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
|
||||
),
|
||||
decoded_string[:8192],
|
||||
count=1,
|
||||
)
|
||||
|
||||
decoded_string = patched_header + decoded_string[8192:]
|
||||
|
||||
self._output_payload = decoded_string.encode(encoding, "replace")
|
||||
|
||||
return self._output_payload # type: ignore
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> str:
|
||||
"""
|
||||
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
|
||||
"""
|
||||
return sha256(self.output()).hexdigest()
|
||||
|
||||
|
||||
class CharsetMatches:
|
||||
"""
|
||||
Container with every CharsetMatch items ordered by default from most probable to the less one.
|
||||
Act like a list(iterable) but does not implements all related methods.
|
||||
"""
|
||||
|
||||
def __init__(self, results: list[CharsetMatch] | None = None):
|
||||
self._results: list[CharsetMatch] = sorted(results) if results else []
|
||||
|
||||
def __iter__(self) -> Iterator[CharsetMatch]:
|
||||
yield from self._results
|
||||
|
||||
def __getitem__(self, item: int | str) -> CharsetMatch:
|
||||
"""
|
||||
Retrieve a single item either by its position or encoding name (alias may be used here).
|
||||
Raise KeyError upon invalid index or encoding not present in results.
|
||||
"""
|
||||
if isinstance(item, int):
|
||||
return self._results[item]
|
||||
if isinstance(item, str):
|
||||
item = iana_name(item, False)
|
||||
for result in self._results:
|
||||
if item in result.could_be_from_charset:
|
||||
return result
|
||||
raise KeyError
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._results)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return len(self._results) > 0
|
||||
|
||||
def append(self, item: CharsetMatch) -> None:
|
||||
"""
|
||||
Insert a single match. Will be inserted accordingly to preserve sort.
|
||||
Can be inserted as a submatch.
|
||||
"""
|
||||
if not isinstance(item, CharsetMatch):
|
||||
raise ValueError(
|
||||
"Cannot append instance '{}' to CharsetMatches".format(
|
||||
str(item.__class__)
|
||||
)
|
||||
)
|
||||
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
|
||||
if len(item.raw) < TOO_BIG_SEQUENCE:
|
||||
for match in self._results:
|
||||
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
|
||||
match.add_submatch(item)
|
||||
return
|
||||
self._results.append(item)
|
||||
self._results = sorted(self._results)
|
||||
|
||||
def best(self) -> CharsetMatch | None:
|
||||
"""
|
||||
Simply return the first match. Strict equivalent to matches[0].
|
||||
"""
|
||||
if not self._results:
|
||||
return None
|
||||
return self._results[0]
|
||||
|
||||
def first(self) -> CharsetMatch | None:
|
||||
"""
|
||||
Redundant method, call the method best(). Kept for BC reasons.
|
||||
"""
|
||||
return self.best()
|
||||
|
||||
|
||||
CoherenceMatch = Tuple[str, float]
|
||||
CoherenceMatches = List[CoherenceMatch]
|
||||
|
||||
|
||||
class CliDetectionResult:
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
encoding: str | None,
|
||||
encoding_aliases: list[str],
|
||||
alternative_encodings: list[str],
|
||||
language: str,
|
||||
alphabets: list[str],
|
||||
has_sig_or_bom: bool,
|
||||
chaos: float,
|
||||
coherence: float,
|
||||
unicode_path: str | None,
|
||||
is_preferred: bool,
|
||||
):
|
||||
self.path: str = path
|
||||
self.unicode_path: str | None = unicode_path
|
||||
self.encoding: str | None = encoding
|
||||
self.encoding_aliases: list[str] = encoding_aliases
|
||||
self.alternative_encodings: list[str] = alternative_encodings
|
||||
self.language: str = language
|
||||
self.alphabets: list[str] = alphabets
|
||||
self.has_sig_or_bom: bool = has_sig_or_bom
|
||||
self.chaos: float = chaos
|
||||
self.coherence: float = coherence
|
||||
self.is_preferred: bool = is_preferred
|
||||
|
||||
@property
|
||||
def __dict__(self) -> dict[str, Any]: # type: ignore
|
||||
return {
|
||||
"path": self.path,
|
||||
"encoding": self.encoding,
|
||||
"encoding_aliases": self.encoding_aliases,
|
||||
"alternative_encodings": self.alternative_encodings,
|
||||
"language": self.language,
|
||||
"alphabets": self.alphabets,
|
||||
"has_sig_or_bom": self.has_sig_or_bom,
|
||||
"chaos": self.chaos,
|
||||
"coherence": self.coherence,
|
||||
"unicode_path": self.unicode_path,
|
||||
"is_preferred": self.is_preferred,
|
||||
}
|
||||
|
||||
def to_json(self) -> str:
|
||||
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
414
.venv/lib/python3.12/site-packages/charset_normalizer/utils.py
Normal file
414
.venv/lib/python3.12/site-packages/charset_normalizer/utils.py
Normal file
|
@ -0,0 +1,414 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
import unicodedata
|
||||
from codecs import IncrementalDecoder
|
||||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import Generator
|
||||
|
||||
from _multibytecodec import ( # type: ignore[import-not-found,import]
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
UNICODE_RANGES_COMBINED,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
UTF8_MAXIMAL_ALLOCATION,
|
||||
COMMON_CJK_CHARACTERS,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
|
||||
try:
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
return (
|
||||
"WITH GRAVE" in description
|
||||
or "WITH ACUTE" in description
|
||||
or "WITH CEDILLA" in description
|
||||
or "WITH DIAERESIS" in description
|
||||
or "WITH CIRCUMFLEX" in description
|
||||
or "WITH TILDE" in description
|
||||
or "WITH MACRON" in description
|
||||
or "WITH RING ABOVE" in description
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
|
||||
decomposed: str = unicodedata.decomposition(character)
|
||||
if not decomposed:
|
||||
return character
|
||||
|
||||
codes: list[str] = decomposed.split(" ")
|
||||
|
||||
return chr(int(codes[0], 16))
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def unicode_range(character: str) -> str | None:
|
||||
"""
|
||||
Retrieve the Unicode range official name from a single character.
|
||||
"""
|
||||
character_ord: int = ord(character)
|
||||
|
||||
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
|
||||
if character_ord in ord_range:
|
||||
return range_name
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_latin(character: str) -> bool:
|
||||
try:
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
return "LATIN" in description
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_punctuation(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "P" in character_category:
|
||||
return True
|
||||
|
||||
character_range: str | None = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Punctuation" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_symbol(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "S" in character_category or "N" in character_category:
|
||||
return True
|
||||
|
||||
character_range: str | None = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Forms" in character_range and character_category != "Lo"
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_emoticon(character: str) -> bool:
|
||||
character_range: str | None = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Emoticons" in character_range or "Pictographs" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_separator(character: str) -> bool:
|
||||
if character.isspace() or character in {"|", "+", "<", ">"}:
|
||||
return True
|
||||
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_case_variable(character: str) -> bool:
|
||||
return character.islower() != character.isupper()
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_cjk(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "CJK" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hiragana(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "HIRAGANA" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_katakana(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "KATAKANA" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hangul(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "HANGUL" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_thai(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "THAI" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_arabic(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "ARABIC" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_arabic_isolated_form(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_cjk_uncommon(character: str) -> bool:
|
||||
return character not in COMMON_CJK_CHARACTERS
|
||||
|
||||
|
||||
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
|
||||
def is_unicode_range_secondary(range_name: str) -> bool:
|
||||
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_unprintable(character: str) -> bool:
|
||||
return (
|
||||
character.isspace() is False # includes \n \t \r \v
|
||||
and character.isprintable() is False
|
||||
and character != "\x1a" # Why? Its the ASCII substitute character.
|
||||
and character != "\ufeff" # bug discovered in Python,
|
||||
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
|
||||
)
|
||||
|
||||
|
||||
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
|
||||
"""
|
||||
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
|
||||
"""
|
||||
if not isinstance(sequence, bytes):
|
||||
raise TypeError
|
||||
|
||||
seq_len: int = len(sequence)
|
||||
|
||||
results: list[str] = findall(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
|
||||
)
|
||||
|
||||
if len(results) == 0:
|
||||
return None
|
||||
|
||||
for specified_encoding in results:
|
||||
specified_encoding = specified_encoding.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if encoding_alias == specified_encoding:
|
||||
return encoding_iana
|
||||
if encoding_iana == specified_encoding:
|
||||
return encoding_iana
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def is_multi_byte_encoding(name: str) -> bool:
|
||||
"""
|
||||
Verify is a specific encoding is a multi byte one based on it IANA name
|
||||
"""
|
||||
return name in {
|
||||
"utf_8",
|
||||
"utf_8_sig",
|
||||
"utf_16",
|
||||
"utf_16_be",
|
||||
"utf_16_le",
|
||||
"utf_32",
|
||||
"utf_32_le",
|
||||
"utf_32_be",
|
||||
"utf_7",
|
||||
} or issubclass(
|
||||
importlib.import_module(f"encodings.{name}").IncrementalDecoder,
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
|
||||
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
|
||||
"""
|
||||
Identify and extract SIG/BOM in given sequence.
|
||||
"""
|
||||
|
||||
for iana_encoding in ENCODING_MARKS:
|
||||
marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
|
||||
|
||||
if isinstance(marks, bytes):
|
||||
marks = [marks]
|
||||
|
||||
for mark in marks:
|
||||
if sequence.startswith(mark):
|
||||
return iana_encoding, mark
|
||||
|
||||
return None, b""
|
||||
|
||||
|
||||
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
|
||||
return iana_encoding not in {"utf_16", "utf_32"}
|
||||
|
||||
|
||||
def iana_name(cp_name: str, strict: bool = True) -> str:
|
||||
"""Returns the Python normalized encoding name (Not the IANA official name)."""
|
||||
cp_name = cp_name.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if cp_name in [encoding_alias, encoding_iana]:
|
||||
return encoding_iana
|
||||
|
||||
if strict:
|
||||
raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
|
||||
|
||||
return cp_name
|
||||
|
||||
|
||||
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
|
||||
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
|
||||
return 0.0
|
||||
|
||||
decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
|
||||
decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
|
||||
|
||||
id_a: IncrementalDecoder = decoder_a(errors="ignore")
|
||||
id_b: IncrementalDecoder = decoder_b(errors="ignore")
|
||||
|
||||
character_match_count: int = 0
|
||||
|
||||
for i in range(255):
|
||||
to_be_decoded: bytes = bytes([i])
|
||||
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
|
||||
character_match_count += 1
|
||||
|
||||
return character_match_count / 254
|
||||
|
||||
|
||||
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
|
||||
"""
|
||||
Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
|
||||
the function cp_similarity.
|
||||
"""
|
||||
return (
|
||||
iana_name_a in IANA_SUPPORTED_SIMILAR
|
||||
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
|
||||
)
|
||||
|
||||
|
||||
def set_logging_handler(
|
||||
name: str = "charset_normalizer",
|
||||
level: int = logging.INFO,
|
||||
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
|
||||
) -> None:
|
||||
logger = logging.getLogger(name)
|
||||
logger.setLevel(level)
|
||||
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(logging.Formatter(format_string))
|
||||
logger.addHandler(handler)
|
||||
|
||||
|
||||
def cut_sequence_chunks(
|
||||
sequences: bytes,
|
||||
encoding_iana: str,
|
||||
offsets: range,
|
||||
chunk_size: int,
|
||||
bom_or_sig_available: bool,
|
||||
strip_sig_or_bom: bool,
|
||||
sig_payload: bytes,
|
||||
is_multi_byte_decoder: bool,
|
||||
decoded_payload: str | None = None,
|
||||
) -> Generator[str, None, None]:
|
||||
if decoded_payload and is_multi_byte_decoder is False:
|
||||
for i in offsets:
|
||||
chunk = decoded_payload[i : i + chunk_size]
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
else:
|
||||
for i in offsets:
|
||||
chunk_end = i + chunk_size
|
||||
if chunk_end > len(sequences) + 8:
|
||||
continue
|
||||
|
||||
cut_sequence = sequences[i : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(
|
||||
encoding_iana,
|
||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
||||
)
|
||||
|
||||
# multi-byte bad cutting detector and adjustment
|
||||
# not the cleanest way to perform that fix but clever enough for now.
|
||||
if is_multi_byte_decoder and i > 0:
|
||||
chunk_partial_size_chk: int = min(chunk_size, 16)
|
||||
|
||||
if (
|
||||
decoded_payload
|
||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
||||
):
|
||||
for j in range(i, i - 4, -1):
|
||||
cut_sequence = sequences[j:chunk_end]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
||||
|
||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
||||
break
|
||||
|
||||
yield chunk
|
|
@ -0,0 +1,8 @@
|
|||
"""
|
||||
Expose version
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
__version__ = "3.4.2"
|
||||
VERSION = __version__.split(".")
|
|
@ -0,0 +1 @@
|
|||
pip
|
|
@ -0,0 +1,31 @@
|
|||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2013-2024, Kim Davies and contributors.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
250
.venv/lib/python3.12/site-packages/idna-3.10.dist-info/METADATA
Normal file
250
.venv/lib/python3.12/site-packages/idna-3.10.dist-info/METADATA
Normal file
|
@ -0,0 +1,250 @@
|
|||
Metadata-Version: 2.1
|
||||
Name: idna
|
||||
Version: 3.10
|
||||
Summary: Internationalized Domain Names in Applications (IDNA)
|
||||
Author-email: Kim Davies <kim+pypi@gumleaf.org>
|
||||
Requires-Python: >=3.6
|
||||
Description-Content-Type: text/x-rst
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Intended Audience :: System Administrators
|
||||
Classifier: License :: OSI Approved :: BSD License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Internet :: Name Service (DNS)
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Utilities
|
||||
Requires-Dist: ruff >= 0.6.2 ; extra == "all"
|
||||
Requires-Dist: mypy >= 1.11.2 ; extra == "all"
|
||||
Requires-Dist: pytest >= 8.3.2 ; extra == "all"
|
||||
Requires-Dist: flake8 >= 7.1.1 ; extra == "all"
|
||||
Project-URL: Changelog, https://github.com/kjd/idna/blob/master/HISTORY.rst
|
||||
Project-URL: Issue tracker, https://github.com/kjd/idna/issues
|
||||
Project-URL: Source, https://github.com/kjd/idna
|
||||
Provides-Extra: all
|
||||
|
||||
Internationalized Domain Names in Applications (IDNA)
|
||||
=====================================================
|
||||
|
||||
Support for the Internationalized Domain Names in
|
||||
Applications (IDNA) protocol as specified in `RFC 5891
|
||||
<https://tools.ietf.org/html/rfc5891>`_. This is the latest version of
|
||||
the protocol and is sometimes referred to as “IDNA 2008”.
|
||||
|
||||
This library also provides support for Unicode Technical
|
||||
Standard 46, `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_.
|
||||
|
||||
This acts as a suitable replacement for the “encodings.idna”
|
||||
module that comes with the Python standard library, but which
|
||||
only supports the older superseded IDNA specification (`RFC 3490
|
||||
<https://tools.ietf.org/html/rfc3490>`_).
|
||||
|
||||
Basic functions are simply executed:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
This package is available for installation from PyPI:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ python3 -m pip install idna
|
||||
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
For typical usage, the ``encode`` and ``decode`` functions will take a
|
||||
domain name argument and perform a conversion to A-labels or U-labels
|
||||
respectively.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
You may use the codec encoding and decoding methods using the
|
||||
``idna.codec`` module:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna.codec
|
||||
>>> print('домен.испытание'.encode('idna2008'))
|
||||
b'xn--d1acufc.xn--80akhbyknj4f'
|
||||
>>> print(b'xn--d1acufc.xn--80akhbyknj4f'.decode('idna2008'))
|
||||
домен.испытание
|
||||
|
||||
Conversions can be applied at a per-label basis using the ``ulabel`` or
|
||||
``alabel`` functions if necessary:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> idna.alabel('测试')
|
||||
b'xn--0zwm56d'
|
||||
|
||||
Compatibility Mapping (UTS #46)
|
||||
+++++++++++++++++++++++++++++++
|
||||
|
||||
As described in `RFC 5895 <https://tools.ietf.org/html/rfc5895>`_, the
|
||||
IDNA specification does not normalize input from different potential
|
||||
ways a user may input a domain name. This functionality, known as
|
||||
a “mapping”, is considered by the specification to be a local
|
||||
user-interface issue distinct from IDNA conversion functionality.
|
||||
|
||||
This library provides one such mapping that was developed by the
|
||||
Unicode Consortium. Known as `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_, it provides for both a regular
|
||||
mapping for typical applications, as well as a transitional mapping to
|
||||
help migrate from older IDNA 2003 applications. Strings are
|
||||
preprocessed according to Section 4.4 “Preprocessing for IDNA2008”
|
||||
prior to the IDNA operations.
|
||||
|
||||
For example, “Königsgäßchen” is not a permissible label as *LATIN
|
||||
CAPITAL LETTER K* is not allowed (nor are capital letters in general).
|
||||
UTS 46 will convert this into lower case prior to applying the IDNA
|
||||
conversion.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> import idna
|
||||
>>> idna.encode('Königsgäßchen')
|
||||
...
|
||||
idna.core.InvalidCodepoint: Codepoint U+004B at position 1 of 'Königsgäßchen' not allowed
|
||||
>>> idna.encode('Königsgäßchen', uts46=True)
|
||||
b'xn--knigsgchen-b4a3dun'
|
||||
>>> print(idna.decode('xn--knigsgchen-b4a3dun'))
|
||||
königsgäßchen
|
||||
|
||||
Transitional processing provides conversions to help transition from
|
||||
the older 2003 standard to the current standard. For example, in the
|
||||
original IDNA specification, the *LATIN SMALL LETTER SHARP S* (ß) was
|
||||
converted into two *LATIN SMALL LETTER S* (ss), whereas in the current
|
||||
IDNA specification this conversion is not performed.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
>>> idna.encode('Königsgäßchen', uts46=True, transitional=True)
|
||||
'xn--knigsgsschen-lcb0w'
|
||||
|
||||
Implementers should use transitional processing with caution, only in
|
||||
rare cases where conversion from legacy labels to current labels must be
|
||||
performed (i.e. IDNA implementations that pre-date 2008). For typical
|
||||
applications that just need to convert labels, transitional processing
|
||||
is unlikely to be beneficial and could produce unexpected incompatible
|
||||
results.
|
||||
|
||||
``encodings.idna`` Compatibility
|
||||
++++++++++++++++++++++++++++++++
|
||||
|
||||
Function calls from the Python built-in ``encodings.idna`` module are
|
||||
mapped to their IDNA 2008 equivalents using the ``idna.compat`` module.
|
||||
Simply substitute the ``import`` clause in your code to refer to the new
|
||||
module name.
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
|
||||
All errors raised during the conversion following the specification
|
||||
should raise an exception derived from the ``idna.IDNAError`` base
|
||||
class.
|
||||
|
||||
More specific exceptions that may be generated as ``idna.IDNABidiError``
|
||||
when the error reflects an illegal combination of left-to-right and
|
||||
right-to-left characters in a label; ``idna.InvalidCodepoint`` when
|
||||
a specific codepoint is an illegal character in an IDN label (i.e.
|
||||
INVALID); and ``idna.InvalidCodepointContext`` when the codepoint is
|
||||
illegal based on its positional context (i.e. it is CONTEXTO or CONTEXTJ
|
||||
but the contextual requirements are not satisfied.)
|
||||
|
||||
Building and Diagnostics
|
||||
------------------------
|
||||
|
||||
The IDNA and UTS 46 functionality relies upon pre-calculated lookup
|
||||
tables for performance. These tables are derived from computing against
|
||||
eligibility criteria in the respective standards. These tables are
|
||||
computed using the command-line script ``tools/idna-data``.
|
||||
|
||||
This tool will fetch relevant codepoint data from the Unicode repository
|
||||
and perform the required calculations to identify eligibility. There are
|
||||
three main modes:
|
||||
|
||||
* ``idna-data make-libdata``. Generates ``idnadata.py`` and
|
||||
``uts46data.py``, the pre-calculated lookup tables used for IDNA and
|
||||
UTS 46 conversions. Implementers who wish to track this library against
|
||||
a different Unicode version may use this tool to manually generate a
|
||||
different version of the ``idnadata.py`` and ``uts46data.py`` files.
|
||||
|
||||
* ``idna-data make-table``. Generate a table of the IDNA disposition
|
||||
(e.g. PVALID, CONTEXTJ, CONTEXTO) in the format found in Appendix
|
||||
B.1 of RFC 5892 and the pre-computed tables published by `IANA
|
||||
<https://www.iana.org/>`_.
|
||||
|
||||
* ``idna-data U+0061``. Prints debugging output on the various
|
||||
properties associated with an individual Unicode codepoint (in this
|
||||
case, U+0061), that are used to assess the IDNA and UTS 46 status of a
|
||||
codepoint. This is helpful in debugging or analysis.
|
||||
|
||||
The tool accepts a number of arguments, described using ``idna-data
|
||||
-h``. Most notably, the ``--version`` argument allows the specification
|
||||
of the version of Unicode to be used in computing the table data. For
|
||||
example, ``idna-data --version 9.0.0 make-libdata`` will generate
|
||||
library data against Unicode 9.0.0.
|
||||
|
||||
|
||||
Additional Notes
|
||||
----------------
|
||||
|
||||
* **Packages**. The latest tagged release version is published in the
|
||||
`Python Package Index <https://pypi.org/project/idna/>`_.
|
||||
|
||||
* **Version support**. This library supports Python 3.6 and higher.
|
||||
As this library serves as a low-level toolkit for a variety of
|
||||
applications, many of which strive for broad compatibility with older
|
||||
Python versions, there is no rush to remove older interpreter support.
|
||||
Removing support for older versions should be well justified in that the
|
||||
maintenance burden has become too high.
|
||||
|
||||
* **Python 2**. Python 2 is supported by version 2.x of this library.
|
||||
Use "idna<3" in your requirements file if you need this library for
|
||||
a Python 2 application. Be advised that these versions are no longer
|
||||
actively developed.
|
||||
|
||||
* **Testing**. The library has a test suite based on each rule of the
|
||||
IDNA specification, as well as tests that are provided as part of the
|
||||
Unicode Technical Standard 46, `Unicode IDNA Compatibility Processing
|
||||
<https://unicode.org/reports/tr46/>`_.
|
||||
|
||||
* **Emoji**. It is an occasional request to support emoji domains in
|
||||
this library. Encoding of symbols like emoji is expressly prohibited by
|
||||
the technical standard IDNA 2008 and emoji domains are broadly phased
|
||||
out across the domain industry due to associated security risks. For
|
||||
now, applications that need to support these non-compliant labels
|
||||
may wish to consider trying the encode/decode operation in this library
|
||||
first, and then falling back to using `encodings.idna`. See `the Github
|
||||
project <https://github.com/kjd/idna/issues/18>`_ for more discussion.
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
idna-3.10.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
idna-3.10.dist-info/LICENSE.md,sha256=pZ8LDvNjWHQQmkRhykT_enDVBpboFHZ7-vch1Mmw2w8,1541
|
||||
idna-3.10.dist-info/METADATA,sha256=URR5ZyDfQ1PCEGhkYoojqfi2Ra0tau2--lhwG4XSfjI,10158
|
||||
idna-3.10.dist-info/RECORD,,
|
||||
idna-3.10.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
|
||||
idna/__init__.py,sha256=MPqNDLZbXqGaNdXxAFhiqFPKEQXju2jNQhCey6-5eJM,868
|
||||
idna/__pycache__/__init__.cpython-312.pyc,,
|
||||
idna/__pycache__/codec.cpython-312.pyc,,
|
||||
idna/__pycache__/compat.cpython-312.pyc,,
|
||||
idna/__pycache__/core.cpython-312.pyc,,
|
||||
idna/__pycache__/idnadata.cpython-312.pyc,,
|
||||
idna/__pycache__/intranges.cpython-312.pyc,,
|
||||
idna/__pycache__/package_data.cpython-312.pyc,,
|
||||
idna/__pycache__/uts46data.cpython-312.pyc,,
|
||||
idna/codec.py,sha256=PEew3ItwzjW4hymbasnty2N2OXvNcgHB-JjrBuxHPYY,3422
|
||||
idna/compat.py,sha256=RzLy6QQCdl9784aFhb2EX9EKGCJjg0P3PilGdeXXcx8,316
|
||||
idna/core.py,sha256=YJYyAMnwiQEPjVC4-Fqu_p4CJ6yKKuDGmppBNQNQpFs,13239
|
||||
idna/idnadata.py,sha256=W30GcIGvtOWYwAjZj4ZjuouUutC6ffgNuyjJy7fZ-lo,78306
|
||||
idna/intranges.py,sha256=amUtkdhYcQG8Zr-CoMM_kVRacxkivC1WgxN1b63KKdU,1898
|
||||
idna/package_data.py,sha256=q59S3OXsc5VI8j6vSD0sGBMyk6zZ4vWFREE88yCJYKs,21
|
||||
idna/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
idna/uts46data.py,sha256=rt90K9J40gUSwppDPCrhjgi5AA6pWM65dEGRSf6rIhM,239289
|
|
@ -0,0 +1,4 @@
|
|||
Wheel-Version: 1.0
|
||||
Generator: flit 3.9.0
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
45
.venv/lib/python3.12/site-packages/idna/__init__.py
Normal file
45
.venv/lib/python3.12/site-packages/idna/__init__.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
from .core import (
|
||||
IDNABidiError,
|
||||
IDNAError,
|
||||
InvalidCodepoint,
|
||||
InvalidCodepointContext,
|
||||
alabel,
|
||||
check_bidi,
|
||||
check_hyphen_ok,
|
||||
check_initial_combiner,
|
||||
check_label,
|
||||
check_nfc,
|
||||
decode,
|
||||
encode,
|
||||
ulabel,
|
||||
uts46_remap,
|
||||
valid_contextj,
|
||||
valid_contexto,
|
||||
valid_label_length,
|
||||
valid_string_length,
|
||||
)
|
||||
from .intranges import intranges_contain
|
||||
from .package_data import __version__
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"IDNABidiError",
|
||||
"IDNAError",
|
||||
"InvalidCodepoint",
|
||||
"InvalidCodepointContext",
|
||||
"alabel",
|
||||
"check_bidi",
|
||||
"check_hyphen_ok",
|
||||
"check_initial_combiner",
|
||||
"check_label",
|
||||
"check_nfc",
|
||||
"decode",
|
||||
"encode",
|
||||
"intranges_contain",
|
||||
"ulabel",
|
||||
"uts46_remap",
|
||||
"valid_contextj",
|
||||
"valid_contexto",
|
||||
"valid_label_length",
|
||||
"valid_string_length",
|
||||
]
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
122
.venv/lib/python3.12/site-packages/idna/codec.py
Normal file
122
.venv/lib/python3.12/site-packages/idna/codec.py
Normal file
|
@ -0,0 +1,122 @@
|
|||
import codecs
|
||||
import re
|
||||
from typing import Any, Optional, Tuple
|
||||
|
||||
from .core import IDNAError, alabel, decode, encode, ulabel
|
||||
|
||||
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
|
||||
|
||||
|
||||
class Codec(codecs.Codec):
|
||||
def encode(self, data: str, errors: str = "strict") -> Tuple[bytes, int]:
|
||||
if errors != "strict":
|
||||
raise IDNAError('Unsupported error handling "{}"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return b"", 0
|
||||
|
||||
return encode(data), len(data)
|
||||
|
||||
def decode(self, data: bytes, errors: str = "strict") -> Tuple[str, int]:
|
||||
if errors != "strict":
|
||||
raise IDNAError('Unsupported error handling "{}"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return "", 0
|
||||
|
||||
return decode(data), len(data)
|
||||
|
||||
|
||||
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
|
||||
def _buffer_encode(self, data: str, errors: str, final: bool) -> Tuple[bytes, int]:
|
||||
if errors != "strict":
|
||||
raise IDNAError('Unsupported error handling "{}"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return b"", 0
|
||||
|
||||
labels = _unicode_dots_re.split(data)
|
||||
trailing_dot = b""
|
||||
if labels:
|
||||
if not labels[-1]:
|
||||
trailing_dot = b"."
|
||||
del labels[-1]
|
||||
elif not final:
|
||||
# Keep potentially unfinished label until the next call
|
||||
del labels[-1]
|
||||
if labels:
|
||||
trailing_dot = b"."
|
||||
|
||||
result = []
|
||||
size = 0
|
||||
for label in labels:
|
||||
result.append(alabel(label))
|
||||
if size:
|
||||
size += 1
|
||||
size += len(label)
|
||||
|
||||
# Join with U+002E
|
||||
result_bytes = b".".join(result) + trailing_dot
|
||||
size += len(trailing_dot)
|
||||
return result_bytes, size
|
||||
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
def _buffer_decode(self, data: Any, errors: str, final: bool) -> Tuple[str, int]:
|
||||
if errors != "strict":
|
||||
raise IDNAError('Unsupported error handling "{}"'.format(errors))
|
||||
|
||||
if not data:
|
||||
return ("", 0)
|
||||
|
||||
if not isinstance(data, str):
|
||||
data = str(data, "ascii")
|
||||
|
||||
labels = _unicode_dots_re.split(data)
|
||||
trailing_dot = ""
|
||||
if labels:
|
||||
if not labels[-1]:
|
||||
trailing_dot = "."
|
||||
del labels[-1]
|
||||
elif not final:
|
||||
# Keep potentially unfinished label until the next call
|
||||
del labels[-1]
|
||||
if labels:
|
||||
trailing_dot = "."
|
||||
|
||||
result = []
|
||||
size = 0
|
||||
for label in labels:
|
||||
result.append(ulabel(label))
|
||||
if size:
|
||||
size += 1
|
||||
size += len(label)
|
||||
|
||||
result_str = ".".join(result) + trailing_dot
|
||||
size += len(trailing_dot)
|
||||
return (result_str, size)
|
||||
|
||||
|
||||
class StreamWriter(Codec, codecs.StreamWriter):
|
||||
pass
|
||||
|
||||
|
||||
class StreamReader(Codec, codecs.StreamReader):
|
||||
pass
|
||||
|
||||
|
||||
def search_function(name: str) -> Optional[codecs.CodecInfo]:
|
||||
if name != "idna2008":
|
||||
return None
|
||||
return codecs.CodecInfo(
|
||||
name=name,
|
||||
encode=Codec().encode,
|
||||
decode=Codec().decode,
|
||||
incrementalencoder=IncrementalEncoder,
|
||||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
)
|
||||
|
||||
|
||||
codecs.register(search_function)
|
15
.venv/lib/python3.12/site-packages/idna/compat.py
Normal file
15
.venv/lib/python3.12/site-packages/idna/compat.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
from typing import Any, Union
|
||||
|
||||
from .core import decode, encode
|
||||
|
||||
|
||||
def ToASCII(label: str) -> bytes:
|
||||
return encode(label)
|
||||
|
||||
|
||||
def ToUnicode(label: Union[bytes, bytearray]) -> str:
|
||||
return decode(label)
|
||||
|
||||
|
||||
def nameprep(s: Any) -> None:
|
||||
raise NotImplementedError("IDNA 2008 does not utilise nameprep protocol")
|
437
.venv/lib/python3.12/site-packages/idna/core.py
Normal file
437
.venv/lib/python3.12/site-packages/idna/core.py
Normal file
|
@ -0,0 +1,437 @@
|
|||
import bisect
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import Optional, Union
|
||||
|
||||
from . import idnadata
|
||||
from .intranges import intranges_contain
|
||||
|
||||
_virama_combining_class = 9
|
||||
_alabel_prefix = b"xn--"
|
||||
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
|
||||
|
||||
|
||||
class IDNAError(UnicodeError):
|
||||
"""Base exception for all IDNA-encoding related problems"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class IDNABidiError(IDNAError):
|
||||
"""Exception when bidirectional requirements are not satisfied"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class InvalidCodepoint(IDNAError):
|
||||
"""Exception when a disallowed or unallocated codepoint is used"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class InvalidCodepointContext(IDNAError):
|
||||
"""Exception when the codepoint is not valid in the context it is used"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
def _combining_class(cp: int) -> int:
|
||||
v = unicodedata.combining(chr(cp))
|
||||
if v == 0:
|
||||
if not unicodedata.name(chr(cp)):
|
||||
raise ValueError("Unknown character in unicodedata")
|
||||
return v
|
||||
|
||||
|
||||
def _is_script(cp: str, script: str) -> bool:
|
||||
return intranges_contain(ord(cp), idnadata.scripts[script])
|
||||
|
||||
|
||||
def _punycode(s: str) -> bytes:
|
||||
return s.encode("punycode")
|
||||
|
||||
|
||||
def _unot(s: int) -> str:
|
||||
return "U+{:04X}".format(s)
|
||||
|
||||
|
||||
def valid_label_length(label: Union[bytes, str]) -> bool:
|
||||
if len(label) > 63:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
|
||||
if len(label) > (254 if trailing_dot else 253):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def check_bidi(label: str, check_ltr: bool = False) -> bool:
|
||||
# Bidi rules should only be applied if string contains RTL characters
|
||||
bidi_label = False
|
||||
for idx, cp in enumerate(label, 1):
|
||||
direction = unicodedata.bidirectional(cp)
|
||||
if direction == "":
|
||||
# String likely comes from a newer version of Unicode
|
||||
raise IDNABidiError("Unknown directionality in label {} at position {}".format(repr(label), idx))
|
||||
if direction in ["R", "AL", "AN"]:
|
||||
bidi_label = True
|
||||
if not bidi_label and not check_ltr:
|
||||
return True
|
||||
|
||||
# Bidi rule 1
|
||||
direction = unicodedata.bidirectional(label[0])
|
||||
if direction in ["R", "AL"]:
|
||||
rtl = True
|
||||
elif direction == "L":
|
||||
rtl = False
|
||||
else:
|
||||
raise IDNABidiError("First codepoint in label {} must be directionality L, R or AL".format(repr(label)))
|
||||
|
||||
valid_ending = False
|
||||
number_type: Optional[str] = None
|
||||
for idx, cp in enumerate(label, 1):
|
||||
direction = unicodedata.bidirectional(cp)
|
||||
|
||||
if rtl:
|
||||
# Bidi rule 2
|
||||
if direction not in [
|
||||
"R",
|
||||
"AL",
|
||||
"AN",
|
||||
"EN",
|
||||
"ES",
|
||||
"CS",
|
||||
"ET",
|
||||
"ON",
|
||||
"BN",
|
||||
"NSM",
|
||||
]:
|
||||
raise IDNABidiError("Invalid direction for codepoint at position {} in a right-to-left label".format(idx))
|
||||
# Bidi rule 3
|
||||
if direction in ["R", "AL", "EN", "AN"]:
|
||||
valid_ending = True
|
||||
elif direction != "NSM":
|
||||
valid_ending = False
|
||||
# Bidi rule 4
|
||||
if direction in ["AN", "EN"]:
|
||||
if not number_type:
|
||||
number_type = direction
|
||||
else:
|
||||
if number_type != direction:
|
||||
raise IDNABidiError("Can not mix numeral types in a right-to-left label")
|
||||
else:
|
||||
# Bidi rule 5
|
||||
if direction not in ["L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"]:
|
||||
raise IDNABidiError("Invalid direction for codepoint at position {} in a left-to-right label".format(idx))
|
||||
# Bidi rule 6
|
||||
if direction in ["L", "EN"]:
|
||||
valid_ending = True
|
||||
elif direction != "NSM":
|
||||
valid_ending = False
|
||||
|
||||
if not valid_ending:
|
||||
raise IDNABidiError("Label ends with illegal codepoint directionality")
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def check_initial_combiner(label: str) -> bool:
|
||||
if unicodedata.category(label[0])[0] == "M":
|
||||
raise IDNAError("Label begins with an illegal combining character")
|
||||
return True
|
||||
|
||||
|
||||
def check_hyphen_ok(label: str) -> bool:
|
||||
if label[2:4] == "--":
|
||||
raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
|
||||
if label[0] == "-" or label[-1] == "-":
|
||||
raise IDNAError("Label must not start or end with a hyphen")
|
||||
return True
|
||||
|
||||
|
||||
def check_nfc(label: str) -> None:
|
||||
if unicodedata.normalize("NFC", label) != label:
|
||||
raise IDNAError("Label must be in Normalization Form C")
|
||||
|
||||
|
||||
def valid_contextj(label: str, pos: int) -> bool:
|
||||
cp_value = ord(label[pos])
|
||||
|
||||
if cp_value == 0x200C:
|
||||
if pos > 0:
|
||||
if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
|
||||
return True
|
||||
|
||||
ok = False
|
||||
for i in range(pos - 1, -1, -1):
|
||||
joining_type = idnadata.joining_types.get(ord(label[i]))
|
||||
if joining_type == ord("T"):
|
||||
continue
|
||||
elif joining_type in [ord("L"), ord("D")]:
|
||||
ok = True
|
||||
break
|
||||
else:
|
||||
break
|
||||
|
||||
if not ok:
|
||||
return False
|
||||
|
||||
ok = False
|
||||
for i in range(pos + 1, len(label)):
|
||||
joining_type = idnadata.joining_types.get(ord(label[i]))
|
||||
if joining_type == ord("T"):
|
||||
continue
|
||||
elif joining_type in [ord("R"), ord("D")]:
|
||||
ok = True
|
||||
break
|
||||
else:
|
||||
break
|
||||
return ok
|
||||
|
||||
if cp_value == 0x200D:
|
||||
if pos > 0:
|
||||
if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
|
||||
return True
|
||||
return False
|
||||
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
|
||||
cp_value = ord(label[pos])
|
||||
|
||||
if cp_value == 0x00B7:
|
||||
if 0 < pos < len(label) - 1:
|
||||
if ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C:
|
||||
return True
|
||||
return False
|
||||
|
||||
elif cp_value == 0x0375:
|
||||
if pos < len(label) - 1 and len(label) > 1:
|
||||
return _is_script(label[pos + 1], "Greek")
|
||||
return False
|
||||
|
||||
elif cp_value == 0x05F3 or cp_value == 0x05F4:
|
||||
if pos > 0:
|
||||
return _is_script(label[pos - 1], "Hebrew")
|
||||
return False
|
||||
|
||||
elif cp_value == 0x30FB:
|
||||
for cp in label:
|
||||
if cp == "\u30fb":
|
||||
continue
|
||||
if _is_script(cp, "Hiragana") or _is_script(cp, "Katakana") or _is_script(cp, "Han"):
|
||||
return True
|
||||
return False
|
||||
|
||||
elif 0x660 <= cp_value <= 0x669:
|
||||
for cp in label:
|
||||
if 0x6F0 <= ord(cp) <= 0x06F9:
|
||||
return False
|
||||
return True
|
||||
|
||||
elif 0x6F0 <= cp_value <= 0x6F9:
|
||||
for cp in label:
|
||||
if 0x660 <= ord(cp) <= 0x0669:
|
||||
return False
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def check_label(label: Union[str, bytes, bytearray]) -> None:
|
||||
if isinstance(label, (bytes, bytearray)):
|
||||
label = label.decode("utf-8")
|
||||
if len(label) == 0:
|
||||
raise IDNAError("Empty Label")
|
||||
|
||||
check_nfc(label)
|
||||
check_hyphen_ok(label)
|
||||
check_initial_combiner(label)
|
||||
|
||||
for pos, cp in enumerate(label):
|
||||
cp_value = ord(cp)
|
||||
if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
|
||||
continue
|
||||
elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
|
||||
try:
|
||||
if not valid_contextj(label, pos):
|
||||
raise InvalidCodepointContext(
|
||||
"Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
|
||||
)
|
||||
except ValueError:
|
||||
raise IDNAError(
|
||||
"Unknown codepoint adjacent to joiner {} at position {} in {}".format(
|
||||
_unot(cp_value), pos + 1, repr(label)
|
||||
)
|
||||
)
|
||||
elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
|
||||
if not valid_contexto(label, pos):
|
||||
raise InvalidCodepointContext(
|
||||
"Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
|
||||
)
|
||||
else:
|
||||
raise InvalidCodepoint(
|
||||
"Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
|
||||
)
|
||||
|
||||
check_bidi(label)
|
||||
|
||||
|
||||
def alabel(label: str) -> bytes:
|
||||
try:
|
||||
label_bytes = label.encode("ascii")
|
||||
ulabel(label_bytes)
|
||||
if not valid_label_length(label_bytes):
|
||||
raise IDNAError("Label too long")
|
||||
return label_bytes
|
||||
except UnicodeEncodeError:
|
||||
pass
|
||||
|
||||
check_label(label)
|
||||
label_bytes = _alabel_prefix + _punycode(label)
|
||||
|
||||
if not valid_label_length(label_bytes):
|
||||
raise IDNAError("Label too long")
|
||||
|
||||
return label_bytes
|
||||
|
||||
|
||||
def ulabel(label: Union[str, bytes, bytearray]) -> str:
|
||||
if not isinstance(label, (bytes, bytearray)):
|
||||
try:
|
||||
label_bytes = label.encode("ascii")
|
||||
except UnicodeEncodeError:
|
||||
check_label(label)
|
||||
return label
|
||||
else:
|
||||
label_bytes = label
|
||||
|
||||
label_bytes = label_bytes.lower()
|
||||
if label_bytes.startswith(_alabel_prefix):
|
||||
label_bytes = label_bytes[len(_alabel_prefix) :]
|
||||
if not label_bytes:
|
||||
raise IDNAError("Malformed A-label, no Punycode eligible content found")
|
||||
if label_bytes.decode("ascii")[-1] == "-":
|
||||
raise IDNAError("A-label must not end with a hyphen")
|
||||
else:
|
||||
check_label(label_bytes)
|
||||
return label_bytes.decode("ascii")
|
||||
|
||||
try:
|
||||
label = label_bytes.decode("punycode")
|
||||
except UnicodeError:
|
||||
raise IDNAError("Invalid A-label")
|
||||
check_label(label)
|
||||
return label
|
||||
|
||||
|
||||
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
|
||||
"""Re-map the characters in the string according to UTS46 processing."""
|
||||
from .uts46data import uts46data
|
||||
|
||||
output = ""
|
||||
|
||||
for pos, char in enumerate(domain):
|
||||
code_point = ord(char)
|
||||
try:
|
||||
uts46row = uts46data[code_point if code_point < 256 else bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
|
||||
status = uts46row[1]
|
||||
replacement: Optional[str] = None
|
||||
if len(uts46row) == 3:
|
||||
replacement = uts46row[2]
|
||||
if (
|
||||
status == "V"
|
||||
or (status == "D" and not transitional)
|
||||
or (status == "3" and not std3_rules and replacement is None)
|
||||
):
|
||||
output += char
|
||||
elif replacement is not None and (
|
||||
status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
|
||||
):
|
||||
output += replacement
|
||||
elif status != "I":
|
||||
raise IndexError()
|
||||
except IndexError:
|
||||
raise InvalidCodepoint(
|
||||
"Codepoint {} not allowed at position {} in {}".format(_unot(code_point), pos + 1, repr(domain))
|
||||
)
|
||||
|
||||
return unicodedata.normalize("NFC", output)
|
||||
|
||||
|
||||
def encode(
|
||||
s: Union[str, bytes, bytearray],
|
||||
strict: bool = False,
|
||||
uts46: bool = False,
|
||||
std3_rules: bool = False,
|
||||
transitional: bool = False,
|
||||
) -> bytes:
|
||||
if not isinstance(s, str):
|
||||
try:
|
||||
s = str(s, "ascii")
|
||||
except UnicodeDecodeError:
|
||||
raise IDNAError("should pass a unicode string to the function rather than a byte string.")
|
||||
if uts46:
|
||||
s = uts46_remap(s, std3_rules, transitional)
|
||||
trailing_dot = False
|
||||
result = []
|
||||
if strict:
|
||||
labels = s.split(".")
|
||||
else:
|
||||
labels = _unicode_dots_re.split(s)
|
||||
if not labels or labels == [""]:
|
||||
raise IDNAError("Empty domain")
|
||||
if labels[-1] == "":
|
||||
del labels[-1]
|
||||
trailing_dot = True
|
||||
for label in labels:
|
||||
s = alabel(label)
|
||||
if s:
|
||||
result.append(s)
|
||||
else:
|
||||
raise IDNAError("Empty label")
|
||||
if trailing_dot:
|
||||
result.append(b"")
|
||||
s = b".".join(result)
|
||||
if not valid_string_length(s, trailing_dot):
|
||||
raise IDNAError("Domain too long")
|
||||
return s
|
||||
|
||||
|
||||
def decode(
|
||||
s: Union[str, bytes, bytearray],
|
||||
strict: bool = False,
|
||||
uts46: bool = False,
|
||||
std3_rules: bool = False,
|
||||
) -> str:
|
||||
try:
|
||||
if not isinstance(s, str):
|
||||
s = str(s, "ascii")
|
||||
except UnicodeDecodeError:
|
||||
raise IDNAError("Invalid ASCII in A-label")
|
||||
if uts46:
|
||||
s = uts46_remap(s, std3_rules, False)
|
||||
trailing_dot = False
|
||||
result = []
|
||||
if not strict:
|
||||
labels = _unicode_dots_re.split(s)
|
||||
else:
|
||||
labels = s.split(".")
|
||||
if not labels or labels == [""]:
|
||||
raise IDNAError("Empty domain")
|
||||
if not labels[-1]:
|
||||
del labels[-1]
|
||||
trailing_dot = True
|
||||
for label in labels:
|
||||
s = ulabel(label)
|
||||
if s:
|
||||
result.append(s)
|
||||
else:
|
||||
raise IDNAError("Empty label")
|
||||
if trailing_dot:
|
||||
result.append("")
|
||||
return ".".join(result)
|
4243
.venv/lib/python3.12/site-packages/idna/idnadata.py
Normal file
4243
.venv/lib/python3.12/site-packages/idna/idnadata.py
Normal file
File diff suppressed because it is too large
Load diff
57
.venv/lib/python3.12/site-packages/idna/intranges.py
Normal file
57
.venv/lib/python3.12/site-packages/idna/intranges.py
Normal file
|
@ -0,0 +1,57 @@
|
|||
"""
|
||||
Given a list of integers, made up of (hopefully) a small number of long runs
|
||||
of consecutive integers, compute a representation of the form
|
||||
((start1, end1), (start2, end2) ...). Then answer the question "was x present
|
||||
in the original list?" in time O(log(# runs)).
|
||||
"""
|
||||
|
||||
import bisect
|
||||
from typing import List, Tuple
|
||||
|
||||
|
||||
def intranges_from_list(list_: List[int]) -> Tuple[int, ...]:
|
||||
"""Represent a list of integers as a sequence of ranges:
|
||||
((start_0, end_0), (start_1, end_1), ...), such that the original
|
||||
integers are exactly those x such that start_i <= x < end_i for some i.
|
||||
|
||||
Ranges are encoded as single integers (start << 32 | end), not as tuples.
|
||||
"""
|
||||
|
||||
sorted_list = sorted(list_)
|
||||
ranges = []
|
||||
last_write = -1
|
||||
for i in range(len(sorted_list)):
|
||||
if i + 1 < len(sorted_list):
|
||||
if sorted_list[i] == sorted_list[i + 1] - 1:
|
||||
continue
|
||||
current_range = sorted_list[last_write + 1 : i + 1]
|
||||
ranges.append(_encode_range(current_range[0], current_range[-1] + 1))
|
||||
last_write = i
|
||||
|
||||
return tuple(ranges)
|
||||
|
||||
|
||||
def _encode_range(start: int, end: int) -> int:
|
||||
return (start << 32) | end
|
||||
|
||||
|
||||
def _decode_range(r: int) -> Tuple[int, int]:
|
||||
return (r >> 32), (r & ((1 << 32) - 1))
|
||||
|
||||
|
||||
def intranges_contain(int_: int, ranges: Tuple[int, ...]) -> bool:
|
||||
"""Determine if `int_` falls into one of the ranges in `ranges`."""
|
||||
tuple_ = _encode_range(int_, 0)
|
||||
pos = bisect.bisect_left(ranges, tuple_)
|
||||
# we could be immediately ahead of a tuple (start, end)
|
||||
# with start < int_ <= end
|
||||
if pos > 0:
|
||||
left, right = _decode_range(ranges[pos - 1])
|
||||
if left <= int_ < right:
|
||||
return True
|
||||
# or we could be immediately behind a tuple (int_, end)
|
||||
if pos < len(ranges):
|
||||
left, _ = _decode_range(ranges[pos])
|
||||
if left == int_:
|
||||
return True
|
||||
return False
|
1
.venv/lib/python3.12/site-packages/idna/package_data.py
Normal file
1
.venv/lib/python3.12/site-packages/idna/package_data.py
Normal file
|
@ -0,0 +1 @@
|
|||
__version__ = "3.10"
|
0
.venv/lib/python3.12/site-packages/idna/py.typed
Normal file
0
.venv/lib/python3.12/site-packages/idna/py.typed
Normal file
8681
.venv/lib/python3.12/site-packages/idna/uts46data.py
Normal file
8681
.venv/lib/python3.12/site-packages/idna/uts46data.py
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1 @@
|
|||
pip
|
|
@ -0,0 +1,123 @@
|
|||
Metadata-Version: 2.4
|
||||
Name: internetarchive
|
||||
Version: 5.4.0
|
||||
Summary: A Python interface to archive.org.
|
||||
Home-page: https://github.com/jjjake/internetarchive
|
||||
Author: Jacob M. Johnson
|
||||
Author-email: jake@archive.org
|
||||
License: AGPL-3.0
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Requires-Python: >=3.9
|
||||
Description-Content-Type: text/x-rst
|
||||
License-File: LICENSE
|
||||
Requires-Dist: jsonpatch>=0.4
|
||||
Requires-Dist: requests<3.0.0,>=2.25.0
|
||||
Requires-Dist: tqdm>=4.0.0
|
||||
Requires-Dist: urllib3>=1.26.0
|
||||
Requires-Dist: importlib-metadata>=3.6.0; python_version <= "3.10"
|
||||
Provides-Extra: all
|
||||
Requires-Dist: black; extra == "all"
|
||||
Requires-Dist: mypy; extra == "all"
|
||||
Requires-Dist: pre-commit; extra == "all"
|
||||
Requires-Dist: pytest; extra == "all"
|
||||
Requires-Dist: safety; extra == "all"
|
||||
Requires-Dist: setuptools; extra == "all"
|
||||
Requires-Dist: pytest==7.1.2; extra == "all"
|
||||
Requires-Dist: responses==0.20.0; extra == "all"
|
||||
Requires-Dist: ruff==0.8.5; extra == "all"
|
||||
Requires-Dist: tqdm-stubs>=0.2.0; extra == "all"
|
||||
Requires-Dist: types-colorama; extra == "all"
|
||||
Requires-Dist: types-jsonpatch>=0.1.0a0; extra == "all"
|
||||
Requires-Dist: types-pygments; extra == "all"
|
||||
Requires-Dist: types-requests<3.0.0,>=2.25.0; extra == "all"
|
||||
Requires-Dist: types-setuptools; extra == "all"
|
||||
Requires-Dist: types-ujson>=4.2.0; extra == "all"
|
||||
Requires-Dist: types-urllib3>=1.26.0; extra == "all"
|
||||
Provides-Extra: dev
|
||||
Requires-Dist: black; extra == "dev"
|
||||
Requires-Dist: mypy; extra == "dev"
|
||||
Requires-Dist: pre-commit; extra == "dev"
|
||||
Requires-Dist: pytest; extra == "dev"
|
||||
Requires-Dist: safety; extra == "dev"
|
||||
Requires-Dist: setuptools; extra == "dev"
|
||||
Provides-Extra: docs
|
||||
Requires-Dist: alabaster==0.7.12; extra == "docs"
|
||||
Requires-Dist: docutils<0.18; extra == "docs"
|
||||
Requires-Dist: sphinx==4.5.0; extra == "docs"
|
||||
Requires-Dist: sphinx-autodoc-typehints==1.18.1; extra == "docs"
|
||||
Provides-Extra: test
|
||||
Requires-Dist: pytest==7.1.2; extra == "test"
|
||||
Requires-Dist: responses==0.20.0; extra == "test"
|
||||
Requires-Dist: ruff==0.8.5; extra == "test"
|
||||
Provides-Extra: types
|
||||
Requires-Dist: tqdm-stubs>=0.2.0; extra == "types"
|
||||
Requires-Dist: types-colorama; extra == "types"
|
||||
Requires-Dist: types-jsonpatch>=0.1.0a0; extra == "types"
|
||||
Requires-Dist: types-pygments; extra == "types"
|
||||
Requires-Dist: types-requests<3.0.0,>=2.25.0; extra == "types"
|
||||
Requires-Dist: types-setuptools; extra == "types"
|
||||
Requires-Dist: types-ujson>=4.2.0; extra == "types"
|
||||
Requires-Dist: types-urllib3>=1.26.0; extra == "types"
|
||||
Dynamic: license-file
|
||||
|
||||
A Python and Command-Line Interface to Archive.org
|
||||
==================================================
|
||||
|
||||
|tox|
|
||||
|versions|
|
||||
|downloads|
|
||||
|contributors|
|
||||
|
||||
.. |tox| image:: https://github.com/jjjake/internetarchive/actions/workflows/tox.yml/badge.svg
|
||||
:target: https://github.com/jjjake/internetarchive/actions/workflows/tox.yml
|
||||
|
||||
.. |versions| image:: https://img.shields.io/pypi/pyversions/internetarchive.svg
|
||||
:target: https://pypi.org/project/internetarchive
|
||||
|
||||
.. |downloads| image:: https://static.pepy.tech/badge/internetarchive/month
|
||||
:target: https://pepy.tech/project/internetarchive
|
||||
|
||||
.. |contributors| image:: https://img.shields.io/github/contributors/jjjake/internetarchive.svg
|
||||
:target: https://github.com/jjjake/internetarchive/graphs/contributors
|
||||
|
||||
This package installs a command-line tool named ``ia`` for using Archive.org from the command-line.
|
||||
It also installs the ``internetarchive`` Python module for programmatic access to archive.org.
|
||||
Please report all bugs and issues on `Github <https://github.com/jjjake/internetarchive/issues>`__.
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
You can install this module via `pipx <https://pipx.pypa.io/stable/>`_:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ pipx install internetarchive
|
||||
|
||||
Binaries of the command-line tool are also available:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ curl -LO https://archive.org/download/ia-pex/ia
|
||||
$ chmod +x ia
|
||||
$ ./ia --help
|
||||
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
Documentation is available at `https://archive.org/services/docs/api/internetarchive <https://archive.org/services/docs/api/internetarchive>`_.
|
||||
|
||||
|
||||
Contributing
|
||||
------------
|
||||
|
||||
All contributions are welcome and appreciated. Please see `https://archive.org/services/docs/api/internetarchive/contributing.html <https://archive.org/services/docs/api/internetarchive/contributing.html>`_ for more details.
|
|
@ -0,0 +1,73 @@
|
|||
../../../bin/ia,sha256=MounJQ_gy2zieVWEXKl6nzDSZJ2l5L66z6_L0gHBIUc,257
|
||||
internetarchive-5.4.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
internetarchive-5.4.0.dist-info/METADATA,sha256=Hjly199gLoyXF2G-rbI7rqPCGXMBDYMYJbGYQSr-awQ,4725
|
||||
internetarchive-5.4.0.dist-info/RECORD,,
|
||||
internetarchive-5.4.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
internetarchive-5.4.0.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
|
||||
internetarchive-5.4.0.dist-info/entry_points.txt,sha256=4MR8dGVeOEZTxj5Au9qyTmM2rpW0e8i7AUWg3MOjPzU,51
|
||||
internetarchive-5.4.0.dist-info/licenses/LICENSE,sha256=V8j_M8nAz8PvAOZQocyRDX7keai8UJ9skgmnwqETmdY,34520
|
||||
internetarchive-5.4.0.dist-info/top_level.txt,sha256=oeKeiWXN3uZRW-PPVRJS-CmdIEpgRtvF41xirMdX3n4,16
|
||||
internetarchive/__init__.py,sha256=5maH8vj-hAVVmpCYv3QQtJyvwBbZiVex6MPQZF5MyUY,2235
|
||||
internetarchive/__pycache__/__init__.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/__version__.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/account.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/api.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/auth.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/catalog.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/config.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/exceptions.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/files.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/iarequest.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/item.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/search.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/session.cpython-312.pyc,,
|
||||
internetarchive/__pycache__/utils.cpython-312.pyc,,
|
||||
internetarchive/__version__.py,sha256=sktW1--ZVshMvDSLP9Be9gNwi9UrCVuR8SYM25If68g,22
|
||||
internetarchive/account.py,sha256=fvt-bZ6C7l0YhBefFnTu9mu_AscCHqMoQ3tZ3ssbTJU,8918
|
||||
internetarchive/api.py,sha256=uoRV_RWxiW_wGkS6xzzcb_rrTvb_xyC-8gPCAgSE_fw,20356
|
||||
internetarchive/auth.py,sha256=nKffajQCXkua5aFKhT5cHSP_aMdfPNTwL7rYC2BCRxo,2641
|
||||
internetarchive/catalog.py,sha256=jLHa9rEtwbn5J5VIqxv1jIGuT57NCjO4WHVI2FqlQPQ,11922
|
||||
internetarchive/cli/__init__.py,sha256=OM-oGfN7G37bPb7v2UB4vrv_gnOzTQTaDi0nq6Qzjrs,1409
|
||||
internetarchive/cli/__pycache__/__init__.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/cli_utils.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_account.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_configure.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_copy.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_delete.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_download.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_flag.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_list.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_metadata.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_move.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_reviews.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_search.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_simplelists.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_tasks.cpython-312.pyc,,
|
||||
internetarchive/cli/__pycache__/ia_upload.cpython-312.pyc,,
|
||||
internetarchive/cli/cli_utils.py,sha256=r86ew6XPz4VCBaPP51WHE8oW7kuAhq_tZz1BiOsKj8I,7381
|
||||
internetarchive/cli/ia.py,sha256=-E6DLu6IY2cs82K-4GmBb7EekcPNzTVuzmVC59bbLis,4751
|
||||
internetarchive/cli/ia_account.py,sha256=RZtvAxnzaMepSxFhQsVHRUIOktt7POjDvBhKGAucRKE,4140
|
||||
internetarchive/cli/ia_configure.py,sha256=HzqULYwLb36u1yU4UC0MJXA8Brl6BbzPZpgWZiXZvzc,7466
|
||||
internetarchive/cli/ia_copy.py,sha256=fFWPMQbgmpsR-dv1FgImG3gzvmCj5J5phXItUNBfI0s,6458
|
||||
internetarchive/cli/ia_delete.py,sha256=xghMPNOSueS4qfhH5BudbVTqVfUC9NcHymM33FssHZk,6502
|
||||
internetarchive/cli/ia_download.py,sha256=K3S2DKowtt6GZCs0CXYxlrhuvxxxeUsZwcgfL_3A0OM,9770
|
||||
internetarchive/cli/ia_flag.py,sha256=taESC-8OU4giqR8OV4RpiSESaIZ9DasKx1hv0R5oZdI,3038
|
||||
internetarchive/cli/ia_list.py,sha256=7R5CbonTiAzW41bRS3tz-vGY1r22UbVBRJC2h-qul74,4816
|
||||
internetarchive/cli/ia_metadata.py,sha256=-GlDYFe7jCk-0gqkFTtAQm6vDyhZ-SONorbLzNgafi8,12631
|
||||
internetarchive/cli/ia_move.py,sha256=IPRYXOYHDI2nBic-VMOw0X_lifixrnXYSpSJ-ElXe6E,3808
|
||||
internetarchive/cli/ia_reviews.py,sha256=3VKCzVFhSEc80aEjTjPWxHQzJ4yKXaW_6pv5ti_ekN4,5195
|
||||
internetarchive/cli/ia_search.py,sha256=UnqKdXmr0LpZmzX_cyX-DhgpnXYZTvIq87-Vxu9qaJE,6565
|
||||
internetarchive/cli/ia_simplelists.py,sha256=OZNfMeTSahhpoIrLk_13K2x0AIblVjDIVH1HqqHU-N0,4426
|
||||
internetarchive/cli/ia_tasks.py,sha256=LENoaB7XSJT5lmpBXetMoLjfVTVoe-UBF1HnbXZZXZE,6795
|
||||
internetarchive/cli/ia_upload.py,sha256=esL_kC6H50SDyCz9-aYlyJ1CxKH-DxD8Rgj95BEQx5s,14471
|
||||
internetarchive/cli/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
internetarchive/config.py,sha256=UoJhVweler_k1SzcjF-U6G73fxw0dN_-STz90BDuILc,6763
|
||||
internetarchive/exceptions.py,sha256=vQXyz0T92x1IWqm-chSD8f6iFkSWkWJu387Vu5oopEA,1858
|
||||
internetarchive/files.py,sha256=ZHxStNHDgSZ6TkKIoUSURMMX06tMrkS3R8vuLvXfd0s,19605
|
||||
internetarchive/iarequest.py,sha256=1RwRIRZKEr6QCk4gEB5-q7Vhtve0JmSSpY9VDPzp_Q0,16971
|
||||
internetarchive/item.py,sha256=xKPWwYhVh4lEQ-5h_qGFaL0Y4lxzu2RPXgPQZTyWNOU,58314
|
||||
internetarchive/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
internetarchive/search.py,sha256=XmFAw73UTwzZLkBBftz3NBfJ04if7qY6XVyU3odUzEM,10585
|
||||
internetarchive/session.py,sha256=39DB1Jx3N505y87rXAefQTTmwa_J2EQDK6tatWvgiDY,23462
|
||||
internetarchive/utils.py,sha256=DiZbGvTEWtRZPQ6nULyOa_555JmAbRzpjlIjmeR27pQ,14678
|
|
@ -0,0 +1,5 @@
|
|||
Wheel-Version: 1.0
|
||||
Generator: setuptools (80.0.0)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
[console_scripts]
|
||||
ia = internetarchive.cli.ia:main
|
|
@ -0,0 +1,661 @@
|
|||
GNU AFFERO GENERAL PUBLIC LICENSE
|
||||
Version 3, 19 November 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU Affero General Public License is a free, copyleft license for
|
||||
software and other kinds of works, specifically designed to ensure
|
||||
cooperation with the community in the case of network server software.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
our General Public Licenses are intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
Developers that use our General Public Licenses protect your rights
|
||||
with two steps: (1) assert copyright on the software, and (2) offer
|
||||
you this License which gives you legal permission to copy, distribute
|
||||
and/or modify the software.
|
||||
|
||||
A secondary benefit of defending all users' freedom is that
|
||||
improvements made in alternate versions of the program, if they
|
||||
receive widespread use, become available for other developers to
|
||||
incorporate. Many developers of free software are heartened and
|
||||
encouraged by the resulting cooperation. However, in the case of
|
||||
software used on network servers, this result may fail to come about.
|
||||
The GNU General Public License permits making a modified version and
|
||||
letting the public access it on a server without ever releasing its
|
||||
source code to the public.
|
||||
|
||||
The GNU Affero General Public License is designed specifically to
|
||||
ensure that, in such cases, the modified source code becomes available
|
||||
to the community. It requires the operator of a network server to
|
||||
provide the source code of the modified version running there to the
|
||||
users of that server. Therefore, public use of a modified version, on
|
||||
a publicly accessible server, gives the public access to the source
|
||||
code of the modified version.
|
||||
|
||||
An older license, called the Affero General Public License and
|
||||
published by Affero, was designed to accomplish similar goals. This is
|
||||
a different license, not a version of the Affero GPL, but Affero has
|
||||
released a new version of the Affero GPL which permits relicensing under
|
||||
this license.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU Affero General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Remote Network Interaction; Use with the GNU General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, if you modify the
|
||||
Program, your modified version must prominently offer all users
|
||||
interacting with it remotely through a computer network (if your version
|
||||
supports such interaction) an opportunity to receive the Corresponding
|
||||
Source of your version by providing access to the Corresponding Source
|
||||
from a network server at no charge, through some standard or customary
|
||||
means of facilitating copying of software. This Corresponding Source
|
||||
shall include the Corresponding Source for any work covered by version 3
|
||||
of the GNU General Public License that is incorporated pursuant to the
|
||||
following paragraph.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the work with which it is combined will remain governed by version
|
||||
3 of the GNU General Public License.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU Affero General Public License from time to time. Such new versions
|
||||
will be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU Affero General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU Affero General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU Affero General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If your software can interact with users remotely through a computer
|
||||
network, you should also make sure that it provides a way for users to
|
||||
get its source. For example, if your program is a web application, its
|
||||
interface could display a "Source" link that leads users to an archive
|
||||
of the code. There are many ways you could offer source, and different
|
||||
solutions will be better for different programs; see section 13 for the
|
||||
specific requirements.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU AGPL, see
|
||||
<http://www.gnu.org/licenses/>.
|
|
@ -0,0 +1 @@
|
|||
internetarchive
|
|
@ -0,0 +1,89 @@
|
|||
#
|
||||
# The internetarchive module is a Python/CLI interface to Archive.org.
|
||||
#
|
||||
# Copyright (C) 2012-2024 Internet Archive
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as
|
||||
# published by the Free Software Foundation, either version 3 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Internetarchive Library
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Internetarchive is a python interface to archive.org.
|
||||
|
||||
Usage::
|
||||
|
||||
>>> from internetarchive import get_item
|
||||
>>> item = get_item('govlawgacode20071')
|
||||
>>> item.exists
|
||||
True
|
||||
|
||||
:copyright: (C) 2012-2024 by Internet Archive.
|
||||
:license: AGPL 3, see LICENSE for more details.
|
||||
"""
|
||||
|
||||
__title__ = 'internetarchive'
|
||||
__author__ = 'Jacob M. Johnson'
|
||||
__license__ = 'AGPL 3'
|
||||
__copyright__ = 'Copyright (C) 2012-2024 Internet Archive'
|
||||
|
||||
from .__version__ import __version__ # isort:skip
|
||||
from internetarchive.api import (
|
||||
configure,
|
||||
delete,
|
||||
download,
|
||||
get_files,
|
||||
get_item,
|
||||
get_session,
|
||||
get_tasks,
|
||||
get_user_info,
|
||||
get_username,
|
||||
modify_metadata,
|
||||
search_items,
|
||||
upload,
|
||||
)
|
||||
from internetarchive.catalog import Catalog
|
||||
from internetarchive.files import File
|
||||
from internetarchive.item import Item
|
||||
from internetarchive.search import Search
|
||||
from internetarchive.session import ArchiveSession
|
||||
|
||||
__all__ = [
|
||||
# Classes.
|
||||
'ArchiveSession',
|
||||
'Catalog',
|
||||
'File',
|
||||
'Item',
|
||||
'Search',
|
||||
'__version__',
|
||||
'configure',
|
||||
'delete',
|
||||
'download',
|
||||
'get_files',
|
||||
# API.
|
||||
'get_item',
|
||||
'get_session',
|
||||
'get_tasks',
|
||||
'get_username',
|
||||
'modify_metadata',
|
||||
'search_items',
|
||||
'upload',
|
||||
]
|
||||
|
||||
|
||||
# Set default logging handler to avoid "No handler found" warnings.
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
log.addHandler(logging.NullHandler())
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue