Comparing the number of new Python2 and Python3 packages on PyPi

Posted on Sun 08 January 2017 in Python

I couldn't find any information about the current activity around the different Python versions. So, I decided to periodically collect the number of PyPI packages from the official site to get some insight into the current popularity of Python2 vs. Python3.

Here are the results: Number of new Python2 and Python3 packages on PyPi

Technical details

The number of packages in each PyPI category is publicly available.

Data scraper script

from sys import stdout
from time import sleep, time
import requests
from lxml import html

# PyPI "browse" page (the query string is the URL-encoded ":action=browse");
# presumably the page that lists package counts per category.
PYPI_CATEGORIES_URL = 'https://pypi.python.org/pypi?%3Aaction=browse'
# Finds the element whose text is exactly "Programming Language :: Python",
# steps up to its parent, and selects the <li> children of that parent's
# first <ul> -- one <li> per listed Python version.
PY_VERSIONS_XPATH = '//*[text()="Programming Language :: Python"]/../ul[1]/li'


def fetch(interval=120, timeout=30):
    """Poll the PyPI category page forever and print one sample per poll.

    Each successful poll writes a single line to stdout of the form
    ``timestamp: <unix time>, <field>, <field>, ...``, where every field is
    built from the non-blank text nodes of one element matched by
    ``PY_VERSIONS_XPATH``, with parentheses removed and joined by ``': '``
    (presumably ``<version>: <package count>`` -- verify against the page).

    A failed request (connection error, timeout, ...) is reported on stderr
    and the loop carries on, so a transient network problem does not stop a
    long-running collection.

    :param interval: seconds to wait between two polls.
    :param timeout: seconds to wait for PyPI before giving up on a request;
        without it a stalled connection would block the loop indefinitely.
    """
    from sys import stderr

    while True:
        try:
            resp = requests.get(PYPI_CATEGORIES_URL, timeout=timeout)
        except requests.RequestException as exc:
            stderr.write('timestamp: {}, request failed: {}\n'.format(
                int(time()), exc))
        else:
            dom = html.fromstring(resp.text.encode('utf-8'))
            fields = []
            for item in dom.xpath(PY_VERSIONS_XPATH):
                # Keep only non-blank text nodes and drop the parentheses.
                parts = [text.strip().replace(')', '').replace('(', '')
                         for text in item.xpath('.//text()') if text.strip()]
                fields.append(': '.join(parts))
            print('timestamp: {}, {}'.format(int(time()), ', '.join(fields)))
            # Flush so the sample reaches a redirected log file right away.
            stdout.flush()
        sleep(interval)


# Start polling as soon as the script runs; fetch() loops without a break,
# so this call only ends when the process is interrupted or an error escapes.
fetch()

Plotting the results

The chart was created with matplotlib. Custom configuration is available in my dotfiles repo.

from collections import defaultdict
from datetime import datetime
from sys import argv
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# X-axis tick labels: date on the first line, time of day on the second.
date_formatter = DateFormatter('%Y.%m.%d\n%H:%M')

# Major version (first character of a tag, e.g. '2' or '3') -> list of
# package-count changes relative to the baseline, one entry per kept sample.
data = defaultdict(list)
# Tag -> package count from the first sample in which that tag appeared.
base = {}
# Timestamp of every kept sample, parallel to the lists in `data`.
dates = []

# Version labels that are read from the log; every other column is ignored.
tags = ['2', '2.6', '2.7', '3', '3.4', '3.5']

# Parse the scraper log given as the first command-line argument.
# Every line looks like "timestamp: <unix time>, <tag>: <count>, ...".
# For each usable line, the change of every tracked tag since its first
# appearance is summed per major version ('2', '3') and appended to `data`,
# with the matching time appended to `dates`.
with open(argv[1]) as infile:
    # Iterate lazily; there is no need to load the whole log with readlines().
    for line in infile:
        cols = [x.split(': ') for x in line.strip().split(', ')]
        # sometimes PyPI gives an empty response; such lines (and any other
        # malformed ones) fail to parse and are skipped.  Only parsing errors
        # are caught, so Ctrl-C and real bugs are not swallowed.
        try:
            timestamp = int(cols[0][1])
            row_data = dict((name, int(value))
                            for name, value in cols[1:] if name in tags)
        except (IndexError, ValueError):
            continue

        # Every series in `data` must stay the same length as `dates`.
        # Skip rows with no tracked version at all, and rows whose set of
        # major versions differs from the series collected so far.
        majors = set(name[0] for name in row_data)
        if not majors or (data and majors != set(data)):
            continue

        row_values = defaultdict(int)
        for col_name, col_value in row_data.items():
            # The first count seen for a tag becomes its baseline.
            baseline = base.setdefault(col_name, col_value)
            row_values[col_name[0]] += col_value - baseline

        # skip lines with same data
        if data and all(data[c][-1] == v for c, v in row_values.items()):
            continue

        for c, value in row_values.items():
            data[c].append(value)
        dates.append(datetime.fromtimestamp(timestamp))


# Draw one line per major Python version on a single set of axes and save
# the chart as 'xy.png' in the current working directory.
fig, ax = plt.subplots(1)

# Use the explicit Axes object throughout instead of mixing it with the
# implicit pyplot "current axes" state.
for py_version, counts in data.items():
    ax.plot(dates, counts, label='Python{}'.format(py_version))

ax.legend()
ax.xaxis.set_major_formatter(date_formatter)
ax.set_title("PyPi package number changes")
fig.savefig('xy.png')