kmx git

#!/usr/bin/env python3

import requests, datetime
import youtube_dl
import argparse, sys
import json
import os, sys
import time
import logging
import cv2
from imagededup.methods import PHash

'''
Notes on the duplicate-removal algorithm (otherwise I won't be able to make sense of it in 10 days)

- Extract the first frame of each downloaded video and save it both as an image file and in a dict
- Use the imagededup library to get the dictionary of all duplicates and originals in the working folder
- Iterate over that dictionary
- If the entry is in the frames dict, delete the videos associated with the duplicate images, as well as the duplicate images themselves
- Then delete the original frame
- If the entry is not in the frames dict, delete all of its duplicates

os.path'''

# Command-line arguments; main() rebinds this global to the parsed argparse result.
args = {}
# URLs of submissions already handled; process_submission() checks and appends to it.
url_list = []

# pushshift helper function
def get_posts(post_type, params, cb, limit=-1):
    """Page backwards through Pushshift search results, handing each page to ``cb``.

    Args:
        post_type: Path segment inserted into the search URL
            (``https://api.pushshift.io/reddit/<post_type>/search``).
        params: Extra query parameters merged into every request. ``size`` and
            ``before`` are always overridden by this function.
        cb: Callable invoked once per fetched page with the list of posts.
        limit: Maximum number of posts to deliver to ``cb``. Any negative value
            (the default is ``-1``) means "no limit".

    Raises:
        requests.HTTPError: If the API answers with an error status.

    Paging stops when the limit is reached or when a page comes back shorter
    than requested. The limit is enforced *before* each request, so ``cb``
    never receives more than ``limit`` posts in total.
    """
    page_max = 100  # largest page this function ever asks for
    unlimited = limit < 0
    req_headers = {
        'User-Agent': 'Python requests - Redditstat.py'
    }
    # Start from "now" and walk back using the timestamp of the oldest post seen.
    last = int(datetime.datetime.now().timestamp())
    got = 0
    while unlimited or got < limit:
        # Never request more than what is still missing to reach the limit.
        size = page_max if unlimited else min(page_max, limit - got)
        logging.info(f"Fetching posts made before {last}")
        req_params = {
            **params,
            'size': size,
            'before': last,
        }
        res = requests.get(
            f'https://api.pushshift.io/reddit/{post_type}/search',
            params=req_params,
            headers=req_headers,
        )
        res.raise_for_status()
        data = res.json()["data"]
        if not unlimited:
            # Guard against a server that returns more than it was asked for.
            data = data[:limit - got]
        cb(data)
        got += len(data)  # count what was actually received, not a fixed 100
        if len(data) < size:
            # A short page means there is nothing older left to fetch.
            break
        last = data[-1]["created_utc"]
    # .get(): the author filter is optional, so do not crash the log line on it.
    logging.info(f"Total of {got} posts fetched from u/{params.get('author')}")

def submission_callback(data):
    """Print how many submissions arrived, then process each one in order."""
    batch_size = len(data)
    print(batch_size)
    for submission in data:
        process_submission(submission)

def process_submission(post):
    """Download the media linked by one submission and remember its URL.

    Args:
        post: Submission dict. The keys read here are ``is_self``, ``url``,
            ``is_video`` and ``author``.

    Self posts and URLs already present in the module-level ``url_list`` are
    skipped. Non-video, non-gif links are fetched with ``requests`` and written
    into the ``<author>/`` folder; everything else is handed to youtube_dl.
    Download failures are reported and skipped so one bad post does not abort
    the whole run. A post missing an expected key prints ``"What?"``; its URL,
    when present, is still recorded.
    """
    global url_list
    try:
        if not post['is_self'] and post['url'] not in url_list:
            url = post['url']
            author = post['author']
            stamp = datetime.datetime.now().strftime('%Y-%m-%dT%H%M%S')
            if not post['is_video'] and "gif" not in url:
                try:
                    res = requests.get(url)
                except requests.RequestException as err:
                    # Best effort, same as the video branch below.
                    print("Unable to download")
                    logging.warning(f"Could not fetch {url}: {err}")
                else:
                    # A Response is truthy only for non-error status codes.
                    if res:
                        print("Downloading file")
                        print(url)
                        with open(f"{author}/{stamp}-{url.split('/')[-1]}", "wb") as f:
                            f.write(res.content)
                            logging.info(f"Photo downloaded from {url} and saved to {f.name}")
            else:
                print("Downloading video")
                ydl_opts = {
                    'outtmpl': f"{author}/{stamp}-%(id)s.%(ext)s",
                    'max_downloads': 1,
                }
                with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                    try:
                        # Resolve metadata first so the final file name can be logged.
                        info_dict = ydl.extract_info(url, download=False)
                        fn = os.path.basename(ydl.prepare_filename(info_dict))
                        ydl.download([url])
                        logging.info(f"Video downloaded from {url} and saved to {fn}")
                    except (youtube_dl.utils.DownloadError, youtube_dl.utils.MaxDownloadsReached):
                        print("Unable to download")
    except KeyError:
        print("What?")
    # Record the URL outside the try, but tolerate posts that have none:
    # indexing post['url'] here used to raise an uncaught KeyError.
    seen_url = post.get('url')
    if seen_url is not None and seen_url not in url_list:
        url_list.append(seen_url)

def extractFirstFrame(cwd):
    """Save the first frame of every ``.mp4`` in ``cwd`` as ``<video>.jpg``.

    Args:
        cwd: Directory to scan. Works with or without a trailing separator.

    Returns:
        Dict mapping each written frame file name (``"<video>.jpg"``) to the
        name of the video it came from. Videos whose first frame could not be
        read are left out.
    """
    logging.info("Beginning extraction of first frame from videos in the folder")
    videos = [name for name in os.listdir(cwd) if name.endswith(".mp4")]
    print(videos)
    video_images = {}
    for video in videos:
        vidcap = cv2.VideoCapture(os.path.join(cwd, video))
        try:
            success, image = vidcap.read()
        finally:
            # Release the capture handle even if reading fails.
            vidcap.release()
        if success:
            frame_name = video + ".jpg"
            cv2.imwrite(os.path.join(cwd, frame_name), image)
            video_images[frame_name] = video
    return video_images
    
       
def _remove_file(path):
    """Delete ``path``; print the error and return False if it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError as e:
        print(e)
        return False
    return True


def removeDuplicates(duplicates, video_frames, images_dir):
    """Delete duplicate pictures/videos and the extracted video frames.

    Args:
        duplicates: Dict mapping a file name to the list of file names
            considered duplicates of it.
        video_frames: Dict mapping an extracted frame file name to the name of
            the video it was taken from.
        images_dir: Directory containing all the files named above.

    For each entry that is a video frame, every duplicate is deleted (together
    with its video when the duplicate is itself a video frame), and then the
    frame image is deleted. For any other entry only the duplicate image files
    are deleted. Files already deleted earlier in the pass are skipped, which
    keeps one copy of each duplicate group. ``duplicates`` is not modified.
    """
    removed = set()  # names already deleted during this pass
    for image, dups in duplicates.items():
        if image in removed:
            # Already deleted as someone else's duplicate; its own list is moot.
            continue
        is_frame = image in video_frames
        for dup in dups or ():
            if dup in removed:
                continue
            if is_frame:
                # The duplicate may be a plain picture with no video behind it;
                # indexing video_frames directly used to raise KeyError here.
                video = video_frames.get(dup)
                if video is not None and _remove_file(os.path.join(images_dir, video)):
                    logging.info(f"Duplicate video found. Deleting {video}")
                _remove_file(os.path.join(images_dir, dup))
            elif _remove_file(os.path.join(images_dir, dup)):
                logging.info(f"Duplicate picture found. Deleting {dup}")
            removed.add(dup)
        if is_frame:
            # The extracted frame is only a working file; the video is kept.
            _remove_file(os.path.join(images_dir, image))
            removed.add(image)
            
        

def main():
    """Command-line entry point: download a user's media, then drop duplicates.

    Steps:
      1. Parse the arguments and configure logging to ``execution.log``.
      2. Create the ``<user>/`` download folder if needed.
      3. Fetch submissions via ``get_posts`` and download their media.
      4. Extract the first frame of each video, hash every image in the
         folder with imagededup's ``PHash`` and delete the duplicates.

    The process exits through ``sys.exit()`` once done.
    """
    global args
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level='INFO', filename='execution.log')
    parser = argparse.ArgumentParser(description="Download reddit media")
    # The user name doubles as the download folder name, so it cannot be omitted.
    parser.add_argument('-u', '--user', required=True, help="USER to download from")
    parser.add_argument('-s', '--subreddit', help="SUBREDDIT to download from")
    parser.add_argument('--ydl-args', help="JSON-format youtube-dl options", default='{}')
    # -1 is get_posts' own "no limit" sentinel.
    parser.add_argument('-l', '--limit', type=int, default=-1, help="Maximum number of posts to be downloaded")
    parser.add_argument('--pushshift-params', help="JSON-formatted pushshift parameters", default='{}')
    args = parser.parse_args()
    logging.info(f"\n\n{'-'*30}\nBeginning download of media from user u/{args.user}")

    # Only report "already exists" when that is really the case; any other
    # OSError (e.g. permissions) now surfaces instead of being mislabelled.
    folder_existed = os.path.isdir(args.user)
    os.makedirs(args.user, exist_ok=True)
    if folder_existed:
        logging.info(f"Folder already exists for reddit user {args.user}")
    else:
        logging.info(f"Created folder for reddit user {args.user}")

    search_params = {
        **json.loads(args.pushshift_params),
        'subreddit': args.subreddit,
        'author': args.user,
    }
    get_posts('submission', search_params, submission_callback, args.limit)

    # Trailing "" keeps the final path separator that the helpers rely on.
    images_dir = os.path.join(os.getcwd(), args.user, "")
    # get dict of video first frames
    video_frames = extractFirstFrame(images_dir)
    # get dict of all duplicates in directory
    logging.info("Beginning hashing function to create dict of duplicates")
    phasher = PHash()
    encodings = phasher.encode_images(image_dir=images_dir)
    duplicates = phasher.find_duplicates(encoding_map=encodings)
    print(video_frames)
    print("\n\n")
    print(duplicates)
    removeDuplicates(duplicates, video_frames, images_dir)
    logging.info("Execution complete. Exiting...")
    sys.exit()


if __name__ == "__main__":
    main()
s0ca/Reddit_Downloader/main.py

Commit

main.py