initial
commit
c68c0e138f
@ -0,0 +1,3 @@
|
||||
thumbnails/
|
||||
images/
|
||||
messages.html
|
@ -0,0 +1,20 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2018 Oliver Steele
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@ -0,0 +1,23 @@
|
||||
[[source]]
|
||||
name = "pypi"
|
||||
url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
|
||||
[requires]
|
||||
python_full_version = "3.6.5"
|
||||
|
||||
[scripts]
|
||||
import = "python import_messages.py"
|
||||
export = "python export_messages.py"
|
||||
list = "python list_rooms.py"
|
||||
|
||||
[packages]
|
||||
matrix_client = "*"
|
||||
mongoengine = "*"
|
||||
click = "*"
|
||||
tabulate = "*"
|
||||
pyyaml = "*"
|
||||
"jinja2" = "*"
|
||||
requests = "*"
|
||||
|
||||
[dev-packages]
|
@ -0,0 +1,210 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "cc3f8517f705a63c6a297c6c461a500fb75e28dc81050219498b261f1ca157b1"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_full_version": "3.6.5"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:5ad7e9a056d25ffa5082862e36f119f7f7cec6457fa07ee2f8c339814b80c9b1",
|
||||
"sha256:9cd41137dc19af6a5e03b630eefe7d1f458d964d406342dd3edf625839b944cc"
|
||||
],
|
||||
"version": "==2020.4.5.2"
|
||||
},
|
||||
"chardet": {
|
||||
"hashes": [
|
||||
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
|
||||
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
|
||||
],
|
||||
"version": "==3.0.4"
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
"sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a",
|
||||
"sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==7.1.2"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb",
|
||||
"sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"
|
||||
],
|
||||
"version": "==2.9"
|
||||
},
|
||||
"jinja2": {
|
||||
"hashes": [
|
||||
"sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0",
|
||||
"sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.11.2"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
"sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
|
||||
"sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
|
||||
"sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
|
||||
"sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
|
||||
"sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
|
||||
"sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
|
||||
"sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
|
||||
"sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
|
||||
"sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
|
||||
"sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
|
||||
"sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
|
||||
"sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
|
||||
"sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
|
||||
"sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
|
||||
"sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
|
||||
"sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
|
||||
"sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
|
||||
"sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
|
||||
"sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
|
||||
"sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
|
||||
"sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
|
||||
"sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
|
||||
"sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
|
||||
"sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
|
||||
"sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
|
||||
"sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
|
||||
"sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
|
||||
"sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
|
||||
"sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
|
||||
"sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
|
||||
"sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
|
||||
"sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
|
||||
"sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"matrix-client": {
|
||||
"hashes": [
|
||||
"sha256:2855a2614a177db66f9bc3ba38cbd2876041456f663c334f72a160ab6bb11c49",
|
||||
"sha256:dce3ccb8665df0d519f08e07a16e6d3f9fab3a947df4b7a7c4bb26573d68f2d5"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.3.2"
|
||||
},
|
||||
"mongoengine": {
|
||||
"hashes": [
|
||||
"sha256:6e127f45f71c2bc5e72461ec297a0c20f04c3ee0bf6dd869e336226e325db6ef",
|
||||
"sha256:db9e5d587e5d74e52851e0e4a53fd744725bfa9918ae6070139f5ba9c62c6edf"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.20.0"
|
||||
},
|
||||
"pymongo": {
|
||||
"hashes": [
|
||||
"sha256:01b4e10027aef5bb9ecefbc26f5df3368ce34aef81df43850f701e716e3fe16d",
|
||||
"sha256:0fc5aa1b1acf7f61af46fe0414e6a4d0c234b339db4c03a63da48599acf1cbfc",
|
||||
"sha256:1396eb7151e0558b1f817e4b9d7697d5599e5c40d839a9f7270bd90af994ad82",
|
||||
"sha256:18e84a3ec5e73adcb4187b8e5541b2ad61d716026ed9863267e650300d8bea33",
|
||||
"sha256:19adf2848b80cb349b9891cc854581bbf24c338be9a3260e73159bdeb2264464",
|
||||
"sha256:20ee0475aa2ba437b0a14806f125d696f90a8433d820fb558fdd6f052acde103",
|
||||
"sha256:26798795097bdeb571f13942beef7e0b60125397811c75b7aa9214d89880dd1d",
|
||||
"sha256:26e707a4eb851ec27bb969b5f1413b9b2eac28fe34271fa72329100317ea7c73",
|
||||
"sha256:2a3c7ad01553b27ec553688a1e6445e7f40355fb37d925c11fcb50b504e367f8",
|
||||
"sha256:2f07b27dbf303ea53f4147a7922ce91a26b34a0011131471d8aaf73151fdee9a",
|
||||
"sha256:316f0cf543013d0c085e15a2c8abe0db70f93c9722c0f99b6f3318ff69477d70",
|
||||
"sha256:31d11a600eea0c60de22c8bdcb58cda63c762891facdcb74248c36713240987f",
|
||||
"sha256:334ef3ffd0df87ea83a0054454336159f8ad9c1b389e19c0032d9cb8410660e6",
|
||||
"sha256:358ba4693c01022d507b96a980ded855a32dbdccc3c9331d0667be5e967f30ed",
|
||||
"sha256:3a6568bc53103df260f5c7d2da36dffc5202b9a36c85540bba1836a774943794",
|
||||
"sha256:444bf2f44264578c4085bb04493bfed0e5c1b4fe7c2704504d769f955cc78fe4",
|
||||
"sha256:47a00b22c52ee59dffc2aad02d0bbfb20c26ec5b8de8900492bf13ad6901cf35",
|
||||
"sha256:4c067db43b331fc709080d441cb2e157114fec60749667d12186cc3fc8e7a951",
|
||||
"sha256:4c092310f804a5d45a1bcaa4191d6d016c457b6ed3982a622c35f729ff1c7f6b",
|
||||
"sha256:53b711b33134e292ef8499835a3df10909c58df53a2a0308f598c432e9a62892",
|
||||
"sha256:568d6bee70652d8a5af1cd3eec48b4ca1696fb1773b80719ebbd2925b72cb8f6",
|
||||
"sha256:56fa55032782b7f8e0bf6956420d11e2d4e9860598dfe9c504edec53af0fc372",
|
||||
"sha256:5a2c492680c61b440272341294172fa3b3751797b1ab983533a770e4fb0a67ac",
|
||||
"sha256:61235cc39b5b2f593086d1d38f3fc130b2d125bd8fc8621d35bc5b6bdeb92bd2",
|
||||
"sha256:619ac9aaf681434b4d4718d1b31aa2f0fce64f2b3f8435688fcbdc0c818b6c54",
|
||||
"sha256:6238ac1f483494011abde5286282afdfacd8926659e222ba9b74c67008d3a58c",
|
||||
"sha256:63752a72ca4d4e1386278bd43d14232f51718b409e7ac86bcf8810826b531113",
|
||||
"sha256:6fdc5ccb43864065d40dd838437952e9e3da9821b7eac605ba46ada77f846bdf",
|
||||
"sha256:7abc3a6825a346fa4621a6f63e3b662bbb9e0f6ffc32d30a459d695f20fb1a8b",
|
||||
"sha256:7aef381bb9ae8a3821abd7f9d4d93978dbd99072b48522e181baeffcd95b56ae",
|
||||
"sha256:80df3caf251fe61a3f0c9614adc6e2bfcffd1cd3345280896766712fb4b4d6d7",
|
||||
"sha256:95f970f34b59987dee6f360d2e7d30e181d58957b85dff929eee4423739bd151",
|
||||
"sha256:993257f6ca3cde55332af1f62af3e04ca89ce63c08b56a387cdd46136c72f2fa",
|
||||
"sha256:9c0a57390549affc2b5dda24a38de03a5c7cbc58750cd161ff5d106c3c6eec80",
|
||||
"sha256:a0794e987d55d2f719cc95fcf980fc62d12b80e287e6a761c4be14c60bd9fecc",
|
||||
"sha256:a3b98121e68bf370dd8ea09df67e916f93ea95b52fc010902312168c4d1aff5d",
|
||||
"sha256:a60756d55f0887023b3899e6c2923ba5f0042fb11b1d17810b4e07395404f33e",
|
||||
"sha256:a676bd2fbc2309092b9bbb0083d35718b5420af3a42135ebb1e4c3633f56604d",
|
||||
"sha256:a732838c78554c1257ff2492f5c8c4c7312d0aecd7f732149e255f3749edd5ee",
|
||||
"sha256:ae65d65fde4135ef423a2608587c9ef585a3551fc2e4e431e7c7e527047581be",
|
||||
"sha256:b070a4f064a9edb70f921bfdc270725cff7a78c22036dd37a767c51393fb956f",
|
||||
"sha256:b6da85949aa91e9f8c521681344bd2e163de894a5492337fba8b05c409225a4f",
|
||||
"sha256:bbf47110765b2a999803a7de457567389253f8670f7daafb98e059c899ce9764",
|
||||
"sha256:c06b3f998d2d7160db58db69adfb807d2ec307e883e2f17f6b87a1ef6c723f11",
|
||||
"sha256:c318fb70542be16d3d4063cde6010b1e4d328993a793529c15a619251f517c39",
|
||||
"sha256:c4aef42e5fa4c9d5a99f751fb79caa880dac7eaf8a65121549318b984676a1b7",
|
||||
"sha256:c9ca545e93a9c2a3bdaa2e6e21f7a43267ff0813e8055adf2b591c13164c0c57",
|
||||
"sha256:da2c3220eb55c4239dd8b982e213da0b79023cac59fe54ca09365f2bc7e4ad32",
|
||||
"sha256:dd8055da300535eefd446b30995c0813cc4394873c9509323762a93e97c04c03",
|
||||
"sha256:e2b46e092ea54b732d98c476720386ff2ccd126de1e52076b470b117bff7e409",
|
||||
"sha256:e334c4f39a2863a239d38b5829e442a87f241a92da9941861ee6ec5d6380b7fe",
|
||||
"sha256:e5c54f04ca42bbb5153aec5d4f2e3d9f81e316945220ac318abd4083308143f5",
|
||||
"sha256:f96333f9d2517c752c20a35ff95de5fc2763ac8cdb1653df0f6f45d281620606"
|
||||
],
|
||||
"version": "==3.10.1"
|
||||
},
|
||||
"pyyaml": {
|
||||
"hashes": [
|
||||
"sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97",
|
||||
"sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76",
|
||||
"sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2",
|
||||
"sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648",
|
||||
"sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf",
|
||||
"sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f",
|
||||
"sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2",
|
||||
"sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee",
|
||||
"sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d",
|
||||
"sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c",
|
||||
"sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==5.3.1"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
|
||||
"sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.23.0"
|
||||
},
|
||||
"tabulate": {
|
||||
"hashes": [
|
||||
"sha256:ac64cb76d53b1231d364babcd72abbb16855adac7de6665122f97b593f1eb2ba",
|
||||
"sha256:db2723a20d04bcda8522165c73eea7c300eda74e0ce852d9022e0159d7895007"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.8.7"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527",
|
||||
"sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115"
|
||||
],
|
||||
"version": "==1.25.9"
|
||||
}
|
||||
},
|
||||
"develop": {}
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
# Matrix Archive Tools
|
||||
|
||||
Import messages from a matrix.org room, for research, archival, and
|
||||
preservation.
|
||||
|
||||
Developed at [Dinacon 2018](https://www.dinacon.org), for use by the
|
||||
documentation team.
|
||||
|
||||
Use this responsibly and ethically. Don't re-publish people's messages
|
||||
without their knowledge and consent.
|
||||
|
||||
## Setup
|
||||
|
||||
Install Pipenv. Run `pipenv install`.
|
||||
|
||||
Set these environment variables: `MATRIX_USER`, `MATRIX_PASSWORD`,
|
||||
`MATRIX_ROOM_IDS`.
|
||||
|
||||
`MATRIX_ROOM_IDS` should be a comma-separated list of Matrix room IDs (or a
|
||||
single id). Run `pipenv run list` to list the room ids.
|
||||
|
||||
Set `MONGODB_URI` to a MongoDB connection URL, *or* install a local MongoDB
|
||||
instance.
|
||||
|
||||
## Usage
|
||||
|
||||
### Import Messages
|
||||
|
||||
`pipenv run import` imports the messages into the database.
|
||||
|
||||
### Export Messages
|
||||
|
||||
`pipenv run export filename.html` exports a text, HTML, JSON, or YAML file,
|
||||
depending on the extension of the file name (`.txt`, `.html`, `.json`, or `.yaml`). The file contains links to the image
|
||||
download URLs on the Matrix server.
|
||||
|
||||
### Download Images
|
||||
|
||||
`pipenv run python download_images.py` downloads all the thumbnail images in the
|
||||
database into a download directory (default `thumbnails`), skipping images that
|
||||
have already been downloaded.
|
||||
|
||||
Use the `--no-thumbnails` option to download full size images instead of
|
||||
thumbnails. In this case, the default directory is `images` instead of
|
||||
`thumbnails`.
|
||||
|
||||
## References
|
||||
|
||||
[Matrix Client-Server API](https://matrix.org/docs/spec/r0.0.0/client_server.html)
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,18 @@
|
||||
import os
|
||||
import re
|
||||
|
||||
from mongoengine import connect
|
||||
|
||||
# Optional MongoDB connection URL; when unset a local database is used.
MONGODB_URI = os.getenv('MONGODB_URI')
# Splits mongodb://<username>:<password>@<host>:<port>/<db> into the keyword
# arguments passed to mongoengine's connect().
MONGO_RE = (r'mongodb://'
            r'(?P<username>.+?)'
            r':(?P<password>.+?)'
            r'@(?P<host>(?:.+?):(?:\d+))'
            r'/(?P<db>.+)')

if MONGODB_URI:
    _uri_match = re.match(MONGO_RE, MONGODB_URI)
    if _uri_match:
        connect_args = _uri_match.groupdict()
        # Report only host and database: the URI itself carries the password.
        print(f"Connecting to {connect_args['host']}/{connect_args['db']}")
        connect(**connect_args)
    else:
        # URIs that do not fit MONGO_RE (for example without credentials or
        # without an explicit port) are handed to mongoengine unparsed.
        print("Connecting to the database named by MONGODB_URI")
        connect(host=MONGODB_URI)
else:
    # No URI configured: use the database named 'matrix' with mongoengine's
    # default connection settings.
    connect('matrix')
|
@ -0,0 +1,63 @@
|
||||
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import click
|
||||
import requests
|
||||
|
||||
import database_connection # noqa: F401
|
||||
from matrix_connection import get_download_url
|
||||
from schema import Message
|
||||
|
||||
|
||||
def download_stem(message, prefer_thumbnails):
    """Return the extension-less file name used for a message's image.

    The thumbnail URL is chosen when ``prefer_thumbnails`` is true and the
    message has one; otherwise the full-size image URL is used. The result is
    that URL's path with leading slashes removed.
    """
    chosen_url = message.thumbnail_url if prefer_thumbnails else None
    if not chosen_url:
        chosen_url = message.image_url
    url_path = urlparse(chosen_url).path
    return url_path.lstrip('/')
|
||||
|
||||
|
||||
def _require_ok(response, url):
    """Raise RuntimeError unless ``response`` has HTTP status 200."""
    if response.status_code != 200:
        raise RuntimeError(
            f"Request for {url} failed with HTTP status {response.status_code}")


def run_downloads(messages, download_dir, prefer_thumbnails):
    """Download the image (or thumbnail) of each message into ``download_dir``.

    A HEAD request is made first to learn the content type; anything that is
    not ``image/*`` is reported and skipped. The file is named after
    ``download_stem`` with the content subtype as its extension.

    Args:
        messages: iterable of messages exposing ``thumbnail_url`` and
            ``image_url``.
        download_dir: ``pathlib.Path`` of an existing directory.
        prefer_thumbnails: use a message's thumbnail when it has one.

    Raises:
        RuntimeError: if a HEAD or GET request does not return status 200.
    """
    for msg in messages:
        image_url = (msg.thumbnail_url if prefer_thumbnails else None) or msg.image_url
        # Resolve once; the same URL serves both the HEAD and the GET request.
        download_url = get_download_url(image_url)

        res = requests.head(download_url)
        _require_ok(res, download_url)
        content_type = res.headers.get('content-type', '')
        # Drop any parameters ("image/png; charset=binary") before splitting,
        # so they cannot end up in the file extension.
        media_type = content_type.split(';', 1)[0].strip()
        mtype, _, subtype = media_type.partition('/')
        if mtype != 'image' or not subtype:
            print(f"Skipping {image_url}: {content_type}")
            continue

        res = requests.get(download_url)
        _require_ok(res, download_url)
        filename = (download_dir / download_stem(msg, prefer_thumbnails)
                    ).with_suffix('.' + subtype)
        print('Downloading', image_url, '->', filename)
        with open(filename, 'wb') as fp:
            fp.write(res.content)
|
||||
|
||||
|
||||
@click.command()
@click.option('--thumbnails/--no-thumbnails', default=True)
@click.argument('output', required=False)
def download_images(thumbnails, output):
    """Download thumbnails (or full-size images with --no-thumbnails).

    OUTPUT is the download directory; it defaults to 'thumbnails' or 'images'.
    Images already present in that directory are skipped.
    """
    noun = 'thumbnails' if thumbnails else 'images'
    download_dir = Path(output or noun)
    messages = [msg for msg in Message.objects
                if msg.content.get('msgtype') == 'm.image']
    # parents=True so that a nested OUTPUT path can be created in one go.
    download_dir.mkdir(parents=True, exist_ok=True)
    # Files are compared by stem because the extension is only known after
    # the server reports the content type.
    current_stems = {p.stem for p in download_dir.glob('*')}
    new_messages = [msg for msg in messages
                    if download_stem(msg, thumbnails)
                    not in current_stems]
    skip_count = len(messages) - len(new_messages)
    if skip_count:
        print(f"Skipping {skip_count} already-downloaded {noun}")
    if new_messages:
        print(f"Downloading {len(new_messages)} new {noun}...")
        run_downloads(new_messages, download_dir, prefer_thumbnails=thumbnails)
    else:
        print("Nothing to do")


if __name__ == '__main__':
    download_images()
|
@ -0,0 +1,80 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import click
|
||||
import yaml
|
||||
from jinja2 import Template
|
||||
|
||||
import database_connection # noqa: F401
|
||||
from matrix_connection import get_download_url
|
||||
from schema import Message
|
||||
|
||||
# Room IDs from the comma-separated MATRIX_ROOM_IDS environment variable
# (KeyError at import time if it is unset). Whitespace around entries and
# empty entries are dropped so that "a, b" and a trailing comma both work.
MATRIX_ROOM_IDS = [entry.strip()
                   for entry in os.environ['MATRIX_ROOM_IDS'].split(',')
                   if entry.strip()]

# File extensions accepted as export formats.
ARCHIVE_FORMATS = ['txt', 'html', 'json', 'yaml']
|
||||
|
||||
|
||||
def encode_message(message):
    """Convert a stored message into a plain dict for templates and dumps.

    The returned dict is built from ``message._data`` with these changes:
    ``id`` is removed, ``sender`` is reduced to the part between ``@`` and
    the first ``:``, ``timestamp`` becomes an ISO 8601 string, and a
    ``content['url']`` entry is replaced by ``get_download_url`` of it.

    The message itself is left untouched: ``content`` is copied before it
    is modified.
    """
    data = message._data.copy()
    data.pop('id')
    # '@alice:example.org' -> 'alice'. Matching stops at the first colon so
    # that a server name carrying a port ('example.org:8448') is dropped too.
    data['sender'] = re.sub(r'@([^:]+):.+', r'\1', data['sender'])
    data['timestamp'] = data['timestamp'].isoformat()
    # _data.copy() is shallow, so rewrite the URL on a copy of content rather
    # than on the dict that still belongs to the stored message.
    content = dict(data['content'])
    if 'url' in content:
        content['url'] = get_download_url(content['url'])
    data['content'] = content
    return data
|
||||
|
||||
|
||||
def replace_by_local_image(data):
    """Return a copy of an encoded message whose image URL is a local path.

    Only ``m.image`` messages that carry an ``info`` entry are changed. The
    thumbnail URL is preferred when present and non-empty; otherwise the URL
    under ``content['file']`` or ``content['url']`` is used. The new URL is
    ``thumbnails/<last path segment>.<mimetype subtype>``.

    Only the last path segment is used because ``content['url']`` may be a
    full download URL by the time it gets here, and the whole path would
    otherwise end up in the local file name.

    The input dict and its ``content`` are not modified. A message with no
    usable mimetype is returned unchanged.
    """
    data = data.copy()
    content = data['content']
    if content.get('msgtype') != 'm.image' or 'info' not in content:
        return data

    info = content['info']
    url = content['file']['url'] if 'file' in content else content['url']
    mimetype = info.get('mimetype')
    if info.get('thumbnail_url'):
        url = info['thumbnail_url']
        # Fall back to the image's own mimetype when thumbnail_info is absent.
        mimetype = (info.get('thumbnail_info') or {}).get('mimetype', mimetype)
    if not mimetype:
        # Without a mimetype the file extension cannot be determined.
        return data

    subtype = mimetype.split('/', 1)[-1]
    media_id = urlparse(url).path.strip('/').rsplit('/', 1)[-1]
    data['content'] = dict(content,
                           url='thumbnails/' + media_id + '.' + subtype)
    return data
|
||||
|
||||
|
||||
def dump_html_archive(data, fp, template_path):
    """Render the messages through a Jinja2 template and write them to ``fp``.

    Args:
        data: iterable of encoded message dicts, exposed to the template as
            ``messages``.
        fp: writable text file object.
        template_path: path of the template file, e.g.
            ``templates/default.html.tpl``.

    Message bodies come from other people and are untrusted, so values are
    HTML-escaped when the template name contains an ``.html`` suffix. Other
    templates (plain text) are rendered without escaping.
    """
    template_file = Path(template_path)
    escape_html = '.html' in template_file.suffixes
    template = Template(template_file.read_text(encoding='utf-8'),
                        autoescape=escape_html)
    fp.write(template.render(messages=data))
|
||||
|
||||
|
||||
@click.command()
@click.option('--room-id')
@click.option('--local-images/--no-local-images', default=True)
@click.argument('filename', default='archive.html')
def export_archive(room_id, local_images, filename):
    """Export a room's stored messages to FILENAME.

    The format (txt, html, json, or yaml) is taken from FILENAME's extension.
    --room-id accepts a room ID or part of a room's display name; without it
    the first entry of MATRIX_ROOM_IDS is exported.
    """
    if room_id and not re.match(r'!.+:.+', room_id):
        # Not shaped like a room ID ('!opaque:server'): treat the value as a
        # fragment of a display name and look the room up.
        from matrix_connection import matrix_client
        rooms = matrix_client().get_rooms()
        wanted = room_id
        room_id = next((rid for rid, room in rooms.items()
                        if wanted in room.display_name), None)
        if room_id is None:
            raise click.BadParameter(
                f"no room has a display name containing {wanted!r}")
    if not room_id:
        room_id, *_ = MATRIX_ROOM_IDS
    fmt = Path(filename).suffix.lstrip('.')
    if fmt not in ARCHIVE_FORMATS:
        raise click.BadParameter(f"{fmt} is not in {ARCHIVE_FORMATS}")
    messages = Message.objects(room_id=room_id).order_by('timestamp')
    data = map(encode_message, messages)
    print(f"Writing {len(messages)} messages to {filename!r}")
    # Write UTF-8 regardless of the platform default; the HTML template
    # declares that charset.
    with open(filename, 'w', encoding='utf-8') as fp:
        if fmt in ('txt', 'html'):
            if local_images:
                data = map(replace_by_local_image, data)
            template_path = f'templates/default.{fmt}.tpl'
            dump_html_archive(data, fp, template_path=template_path)
        elif fmt == 'json':
            json.dump(list(data), fp, indent=2)
        elif fmt == 'yaml':
            yaml.dump(list(data), fp, default_flow_style=None)


if __name__ == '__main__':
    export_archive()
|
@ -0,0 +1,86 @@
|
||||
import os
|
||||
from datetime import datetime
|
||||
from itertools import islice
|
||||
|
||||
import click
|
||||
|
||||
import database_connection # noqa: F401
|
||||
from matrix_connection import matrix_client
|
||||
from mongoengine.errors import FieldDoesNotExist, ValidationError
|
||||
from schema import Message
|
||||
|
||||
# Room IDs from the comma-separated MATRIX_ROOM_IDS environment variable
# (KeyError at import time if it is unset). Whitespace around entries and
# empty entries are dropped so that "a, b" and a trailing comma both work.
MATRIX_ROOM_IDS = [entry.strip()
                   for entry in os.environ['MATRIX_ROOM_IDS'].split(',')
                   if entry.strip()]

# Event types that are imported; every other event type is ignored.
MESSAGE_EVENT_TYPES = {'m.room.message', 'm.room.message.feedback'}
|
||||
|
||||
|
||||
def get_room_events(room_id):
    """Yield a room's events: those the client already holds, then history.

    After ``room.events`` has been yielded, further events are fetched in
    batches with ``get_room_messages`` in direction ``'b'``, starting from
    ``room.prev_batch`` and following each response's ``end`` token.

    Paging stops when a batch is empty, when the response carries no ``end``
    token, or when the token does not advance.

    Raises:
        KeyError: if ``room_id`` is not among the client's rooms.
    """
    room = matrix_client().get_rooms()[room_id]
    print(f"Reading events from room {room.display_name!r}…")
    yield from room.events
    batch_size = 1000  # empirically, this is the largest honored value
    prev_batch = room.prev_batch
    while True:
        res = room.client.api.get_room_messages(room.room_id, prev_batch, 'b',
                                                limit=batch_size)
        events = res['chunk']
        if not events:
            break
        print(f"Read {len(events)} events...")
        yield from events
        next_batch = res.get('end')
        # A missing token means there is nothing further to fetch; an
        # unchanged one would make this loop request the same page forever.
        if not next_batch or next_batch == prev_batch:
            break
        prev_batch = next_batch
|
||||
|
||||
|
||||
def import_events(room_id, limit=None):
    """Save a room's not-yet-stored message events, yielding each ``Message``.

    Events are kept only if their type is in ``MESSAGE_EVENT_TYPES``, they
    are not redacted, and no message with the same event ID is already stored
    for this room.

    Args:
        room_id: ID of the room to import from.
        limit: optional maximum number of messages to save; a falsy value
            means no limit.

    Raises:
        FieldDoesNotExist, ValidationError: re-raised after printing the
            offending fields when an event does not fit the ``Message`` schema.
    """
    events = get_room_events(room_id)
    # restrict to messages
    messages = (event for event in events if event['type'] in MESSAGE_EVENT_TYPES)
    # exclude redacted messages
    messages = (event for event in messages if 'redacted_because' not in event)
    # exclude messages that have already been saved; the lookup uses the
    # room_id argument because that is the value stored with each message
    messages = (event for event in messages
                if not Message.objects(event_id=event['event_id'],
                                       room_id=room_id))
    if limit:
        messages = islice(messages, limit)
    for event in messages:
        fields = event.copy()
        fields['messageType'] = fields.pop('type')
        fields['room_id'] = room_id
        # origin_server_ts is divided by 1000 to get seconds.
        # NOTE(review): fromtimestamp() returns naive local time, so stored
        # values depend on the importing machine's timezone; confirm whether
        # UTC was intended.
        fields['timestamp'] = datetime.fromtimestamp(
            fields.pop('origin_server_ts') / 1000)
        fields.pop('age', None)
        fields.pop('unsigned', None)
        try:
            message = Message(**replace_dots(fields))
            message.save()
        except (FieldDoesNotExist, ValidationError):
            # Show the event that failed before propagating the error.
            print(fields)
            raise

        yield message
|
||||
|
||||
|
||||
def replace_dots(obj):
    """Recursively replace '.' by '•' in dictionary key names, to avoid mongodb
    error.

    Dictionaries are rebuilt with renamed keys at every nesting level,
    including dictionaries held inside lists. Any other value is returned
    as is. The input is not modified.
    """
    if isinstance(obj, dict):
        return {k.replace('.', '•'): replace_dots(v) for k, v in obj.items()}
    if isinstance(obj, list):
        # Dotted keys can also hide in dictionaries nested inside lists.
        return [replace_dots(item) for item in obj]
    return obj
|
||||
|
||||
|
||||
|
||||
@click.command()
@click.option('--limit', type=int)
def cli(limit):
    """Import events."""
    # Import each configured room in turn, reporting how many messages were
    # saved for it.
    for room_id in MATRIX_ROOM_IDS:
        import_count = 0
        for _saved in import_events(room_id, limit):
            import_count += 1
        print(f"Imported {import_count} messages")
    # Finish with the total number of messages now stored.
    total = Message.objects.count()
    print(f"The database now has {total} messages")


if __name__ == '__main__':
    cli()
|
@ -0,0 +1,23 @@
|
||||
import re
|
||||
|
||||
import click
|
||||
|
||||
from matrix_connection import matrix_client
|
||||
from tabulate import tabulate
|
||||
|
||||
|
||||
@click.command()
@click.argument('pattern', required=False, type=str)
def list_rooms(pattern):
    """List room ids and keys."""
    rooms = matrix_client().get_rooms()
    # One (room id, display name) row per room.
    rows = []
    for rid, room in rooms.items():
        rows.append((rid, room.display_name))
    if pattern:
        # PATTERN is a regular expression; surrounding slashes are dropped.
        regex = pattern.strip('/')
        rows = [row for row in rows if re.search(regex, row[1])]
    table = tabulate(rows, headers=['Room ID', 'Display Name'])
    print(table)


if __name__ == '__main__':
    list_rooms()
|
@ -0,0 +1,36 @@
|
||||
import os
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from matrix_client.client import MatrixClient
|
||||
|
||||
# Login credentials; both variables are required, and a missing one raises
# KeyError when this module is imported.
MATRIX_USER = os.environ['MATRIX_USER']
MATRIX_PASSWORD = os.environ['MATRIX_PASSWORD']
# Server to sign into; falls back to https://matrix.org when MATRIX_HOST is
# not set.
MATRIX_HOST = os.getenv('MATRIX_HOST', "https://matrix.org")

# Signed-in client, created on the first call to matrix_client().
_client = None
# Download-URL resolver per media host, filled in by get_download_url().
_download_url_resolvers = {}
|
||||
|
||||
|
||||
def matrix_client():
    """Return the shared ``MatrixClient``, signing in on the first call.

    The client is created for ``MATRIX_HOST`` and logged in with
    ``MATRIX_USER`` / ``MATRIX_PASSWORD``; later calls return the same object.
    """
    global _client
    if not _client:
        print(f"Signing into {MATRIX_HOST}...")
        session = MatrixClient(MATRIX_HOST)
        session.login_with_password(username=MATRIX_USER,
                                    password=MATRIX_PASSWORD)
        _client = session
    return _client
|
||||
|
||||
|
||||
def get_download_url(url):
    """Translate an ``mxc://`` URL into an ``https://`` download URL.

    The host part of ``url`` selects a resolver (the ``get_download_url``
    method of a ``MatrixClient`` API object created for that host). One
    resolver is created per host and reused on later calls.

    Raises:
        ValueError: if ``url`` does not use the ``mxc`` scheme.
    """
    u = urlparse(url)
    # Validate with an exception rather than assert, which python -O removes.
    if u.scheme != 'mxc':
        raise ValueError(f"expected an mxc:// URL, got {url!r}")
    host = u.netloc
    resolver = _download_url_resolvers.get(host)
    if resolver is None:
        resolver = MatrixClient(host).api.get_download_url
        _download_url_resolvers[host] = resolver
    return 'https://' + resolver(url)
|
||||
|
||||
|
||||
# Download-URL resolver bound to MATRIX_HOST. Evaluating this line constructs
# a MatrixClient when the module is imported. It is not referenced by the
# functions above.
get_matrix_download_url = MatrixClient(MATRIX_HOST).api.get_download_url
|
@ -0,0 +1,25 @@
|
||||
from mongoengine import DateTimeField, Document, DynamicField, StringField, BooleanField
|
||||
|
||||
|
||||
class Message(Document):
    """A Matrix room message event as stored in the database.

    ``event_id`` is unique within a room. ``content`` holds the event's
    content dictionary as received; for image messages it is where
    ``image_url`` and ``thumbnail_url`` are read from.
    """

    # The patterns below are passed as StringField's ``regex`` argument.
    room_id = StringField(regex=r'!.+:.+', required=True)
    event_id = StringField(regex=r'\$.+', required=True, unique_with='room_id')
    sender = StringField(regex=r'@.+:.+', required=True)
    user_id = StringField(regex=r'@.+:.+', required=False)
    # Stored under the field name 'type'.
    messageType = StringField(regex=r'm\.room\.message', db_field='type',
                              required=True)
    timestamp = DateTimeField(required=True)
    content = DynamicField(required=True)
    verified = BooleanField(required=False)
    decrypted = BooleanField(required=False)

    def is_image(self):
        """Return True if this message's ``msgtype`` is ``m.image``."""
        return self.content.get('msgtype') == 'm.image'

    @property
    def image_url(self):
        """URL of the full-size image, or None for non-image messages.

        The URL is read from ``content['url']`` or, when that is absent,
        from ``content['file']['url']``.

        Raises:
            KeyError: if an image message carries neither entry.
        """
        if not self.is_image():
            return None
        if 'url' in self.content:
            return self.content['url']
        return self.content['file']['url']

    @property
    def thumbnail_url(self):
        """URL of the thumbnail, or None if there is none.

        None is returned for non-image messages and for image messages
        without an ``info`` entry or without a ``thumbnail_url`` in it.
        """
        if not self.is_image():
            return None
        return (self.content.get('info') or {}).get('thumbnail_url')
|
@ -0,0 +1,3 @@
|
||||
[flake8]
|
||||
ignore = D100,D101,D102,D103,D104
|
||||
max-line-length = 88
|
@ -0,0 +1,19 @@
|
||||
<meta charset="UTF-8">
|
||||
{% for message in messages %}
|
||||
{% set content = message.content %}
|
||||
<div class="message">
|
||||
<dl>
|
||||
<dt>From</dt>
|
||||
<dd>{{ message.sender }}</dd>
|
||||
<dt>Date</dt>
|
||||
<dd>{{ message.timestamp }}</dd>
|
||||
</dl>
|
||||
{% if content.msgtype == 'm.text' %}
|
||||
<div class="body">{{ content.body }}</div>
|
||||
{% elif content.msgtype == 'm.image' %}
|
||||
<div class="body"><img src="{{ content.url }}" /></div>
|
||||
{% else %}
|
||||
<div class="error">Unknown message type</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
@ -0,0 +1,13 @@
|
||||
{% for message in messages -%}
|
||||
{%- set content = message.content -%}
|
||||
From {{ message.sender }}
|
||||
Date {{ message.timestamp }}
|
||||
{% if content.msgtype == 'm.text' %}
|
||||
{{ content.body }}
|
||||
{%- elif content.msgtype == 'm.image' -%}
|
||||
Image: {{ content.url }}
|
||||
{%- else -%}
|
||||
Unknown type: {{ content.msgtype }}
|
||||
{%- endif %}
|
||||
---
|
||||
{% endfor %}
|
Loading…
Reference in New Issue