Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support flipped images #17

Merged
merged 8 commits into from
Sep 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,24 @@ sudo systemctl disable mongodb.service
sudo service mongodb stop
```

Alternatively, install MongoDB from https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/

sudo apt-get install -y mongodb-org
sudo systemctl status mongod

See also https://stackoverflow.com/questions/37565758/mongodb-not-working-on-ubuntu-mongod-service-failed-with-result-exit-code and https://medium.com/@balasubramanim/how-to-resolve-socketexception-address-already-in-use-mongodb-75fa8ea4a2a6. Note that restarting has to be done with:

sudo systemctl start mongod

If problems keep arising, a workaround is to run duplicate_finder.py with sudo. When
running as root, if http://127.0.0.1:5000 gives the error 'Not Found', open the URL
file:///tmp/tmp......../0.html with Firefox (not Chrome or Chromium from flathub).
You may first need to run chown -R yourusername:yourusername on /tmp/tmp......../.

In case pip3 has problems:

apt-get install -y python3-numpy python3-scipy python3-pywt

Python 2 is the default version of Python, so we have to call `python3` explicitly:

```bash
Expand Down
37 changes: 32 additions & 5 deletions duplicate_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
A tool to find and remove duplicate pictures.

Usage:
duplicate_finder.py add <path> ... [--db=<db_path>] [--parallel=<num_processes>]
duplicate_finder.py add <path> ... [--flipped] [--nomax] [--db=<db_path>] [--parallel=<num_processes>]
duplicate_finder.py remove <path> ... [--db=<db_path>]
duplicate_finder.py clear [--db=<db_path>]
duplicate_finder.py show [--db=<db_path>]
Expand All @@ -18,6 +18,9 @@
--parallel=<num_processes> The number of parallel processes to run to hash the image
files (default: number of CPUs).

add:
--flipped Also add flipped images
--nomax No maximum file size
find:
--print Only print duplicate files rather than displaying HTML file
--delete Move all found duplicate pictures to the trash. This option takes priority over --print.
Expand All @@ -28,6 +31,7 @@

import concurrent.futures
from contextlib import contextmanager
from datetime import datetime
import os
import magic
import math
Expand All @@ -46,6 +50,7 @@
import pymongo
from termcolor import cprint

FLIPPED = False

@contextmanager
def connect_to_db(db_conn_string='./db'):
Expand Down Expand Up @@ -122,6 +127,7 @@ def hash_file(file):
img = Image.open(file)

file_size = get_file_size(file)
file_time = get_file_time(file)
image_size = get_image_size(img)
capture_time = get_capture_time(img)

Expand All @@ -131,12 +137,20 @@ def hash_file(file):
turned_img = img.rotate(angle, expand=True)
else:
turned_img = img
hashes.append(str(imagehash.phash(turned_img)))
string = str(imagehash.phash(turned_img))
if string not in hashes:
hashes.append(string)
# also hash flipped image
if FLIPPED:
flipped_img = turned_img.transpose(method=Image.FLIP_LEFT_RIGHT)
string = str(imagehash.phash(flipped_img))
if string not in hashes:
hashes.append(string)

hashes = ''.join(sorted(hashes))

cprint("\tHashed {}".format(file), "blue")
return file, hashes, file_size, image_size, capture_time
return file, hashes, file_size, file_time, image_size, capture_time
except OSError:
cprint("\tUnable to open {}".format(file), "red")
return None
Expand All @@ -149,11 +163,12 @@ def hash_files_parallel(files, num_processes=None):
yield result


def _add_to_database(file_, hash_, file_size, image_size, capture_time, db):
def _add_to_database(file_, hash_, file_size, file_time, image_size, capture_time, db):
try:
db.insert_one({"_id": file_,
"hash": hash_,
"file_size": file_size,
"file_time": file_time,
"image_size": image_size,
"capture_time": capture_time})
except pymongo.errors.DuplicateKeyError:
Expand Down Expand Up @@ -228,6 +243,7 @@ def find(db, match_time=False):
"$push": {
"file_name": "$_id",
"file_size": "$file_size",
"file_time": "$file_time",
"image_size": "$image_size",
"capture_time": "$capture_time"
}
Expand Down Expand Up @@ -288,7 +304,7 @@ def render(duplicates, current, total):

with TemporaryDirectory() as folder:
# Generate all of the HTML files
chunk_size = 25
chunk_size = 50
for i, dups in enumerate(chunked(duplicates, chunk_size)):
with open('{}/{}.html'.format(folder, i), 'w') as f:
f.write(render(dups,
Expand All @@ -311,6 +327,13 @@ def get_file_size(file_name):
return 0


def get_file_time(file_name):
    """Return the last-modification time of a file as a formatted string.

    The timestamp reported by ``os.path.getmtime`` is converted with
    ``datetime.fromtimestamp`` and rendered as ``'%Y:%m:%d, %H:%M:%S'``.

    :param file_name: path of the file to inspect
    :return: the formatted timestamp, or ``0`` if the file does not exist
    """
    try:
        # Only the stat call can raise FileNotFoundError, so keep the
        # try body limited to it.
        modified = os.path.getmtime(file_name)
    except FileNotFoundError:
        return 0
    return datetime.fromtimestamp(modified).strftime('%Y:%m:%d, %H:%M:%S')


def get_image_size(img):
return "{} x {}".format(*img.size)

Expand Down Expand Up @@ -349,6 +372,10 @@ def get_capture_time(img):

with connect_to_db(db_conn_string=DB_PATH) as db:
if args['add']:
if args['--flipped']:
FLIPPED = True
if args['--nomax']:
Image.MAX_IMAGE_PIXELS = None
add(args['<path>'], db, NUM_PROCESSES)
elif args['remove']:
remove(args['<path>'], db)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ python-magic==0.4.27
scipy==1.5.3
setuptools==65.4.0
six==1.15.0
termcolor==2.0.1
termcolor==2.0.1
9 changes: 7 additions & 2 deletions template/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,17 @@
{% macro image(img, size) -%}
<div class="col-xs-{{ size }}">
<div class="thumbnail">
<img class="img-responsive" src="{{ img['file_name'] }}" alt="{{ img['file_name'] }}">
<a target="_blank" href="{{ img['file_name'] }}"><img class="img-responsive" src="{{ img['file_name'] }}" alt="{{ img['file_name'] }}"></a>
<div class="caption">
<h5 class="name">{{ img['file_name'] }}</h5>
<div class="file-size">{{ img['file_size'] | filesizeformat }}</div>
<div class="resolution">{{ img['image_size'] }}</div>
<div class="capture-time">{{ img['capture_time'] }}</div>
<div class="file-time">FILE {{ img['file_time'] }}</div>
{% if img['capture_time'] == 'Time unknown' %}
<div class="capture-time">EXIF <strong>{{ img['capture_time'] }}</strong></div>
{% else %}
<div class="capture-time">EXIF {{ img['capture_time'] }}</div>
{% endif %}
<button class="btn btn-danger delete-btn" role="button" data-name="{{ img['file_name'] }}" style="margin-top: 15px">
Delete
</button>
Expand Down