From 4ee9f6d5f3ac32640e421783d4c7cc79ef63962d Mon Sep 17 00:00:00 2001 From: moonman Date: Tue, 8 Apr 2014 01:58:19 -0600 Subject: [PATCH] core/linux-raspberrypi-latest to 3.14-1 --- core/linux-raspberrypi-latest/PKGBUILD | 27 +- core/linux-raspberrypi-latest/config | 96 +- .../uksm-0.1.2.2-for-v3.13.patch | 7064 ----------------- 3 files changed, 83 insertions(+), 7104 deletions(-) delete mode 100644 core/linux-raspberrypi-latest/uksm-0.1.2.2-for-v3.13.patch diff --git a/core/linux-raspberrypi-latest/PKGBUILD b/core/linux-raspberrypi-latest/PKGBUILD index 5afad92a8..fa1857a1b 100644 --- a/core/linux-raspberrypi-latest/PKGBUILD +++ b/core/linux-raspberrypi-latest/PKGBUILD @@ -5,11 +5,11 @@ buildarch=18 pkgbase=linux-raspberrypi-latest -_commit=23a799541065ff1698599d31d0601a4a3042c569 +_commit=dbddcf763ee7cecef28a677494c4f2c21e1137b7 _srcname=linux-${_commit} _kernelname=${pkgbase#linux} _desc="Raspberry Pi" -pkgver=3.13.9 +pkgver=3.14 pkgrel=1 bfqver=v7r2 @@ -20,6 +20,7 @@ makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'uboot-mkimage' 'git' 'pyt options=('!strip') source=("https://github.com/raspberrypi/linux/archive/${_commit}.tar.gz" 'config' + '0001-Bluetooth-allocate-static-minor-for-vhci.patch' 'change-default-console-loglevel.patch' 'args-uncompressed.txt' 'boot-uncompressed.txt' @@ -30,17 +31,18 @@ source=("https://github.com/raspberrypi/linux/archive/${_commit}.tar.gz" "http://algo.ing.unimo.it/people/paolo/disk_sched/patches/${pkgver:0:4}.0-${bfqver}/0002-block-introduce-the-BFQ-${bfqver}-I-O-sched-for-${pkgver:0:4}.patch" "http://algo.ing.unimo.it/people/paolo/disk_sched/patches/${pkgver:0:4}.0-${bfqver}/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-${bfqver}-for-${pkgver:0:4}.0.patch") -md5sums=('1a34ddcc9a84deb3d4d8c30347e2499a' - '2c08a3c84805d8d41c02257eb37bd113' +md5sums=('cfe61f6c40b3a494d1d15231b18f3d07' + '5941f9cab8be1c5b80935c8ff9a66601' + '1b276abe16d14e133f3f28d9c9e6bd68' 'c0d63f4d42fd334bf31e4f749d4694f3' '9335d1263fd426215db69841a380ea26' 'a00e424e2fbb8c5a5f77ba2c4871bed4' '2f82dbe5752af65ff409d737caf11954' '8e91bb5369dabec81bcd28b8d6484898' 'SKIP' - 'af879e680d2b723284bed9c660d9bba8' - '64b67eb8d84c94fc3929cceaf8d37895' - '5c6962340e5617086f4bb9513918bd08') + '42aec4f3943140e8346a778f8967b8e5' + 'f1eea13a89f4dbcb5cb1ae3d097ef6a0' + '0bccf1a2c80875e70287e20b8c2a8b19') prepare() { cd "${srcdir}/${_srcname}" @@ -56,10 +58,10 @@ msg "Patches:" cp ${srcdir}/boot-uncompressed.txt arch/arm/boot/ cp ${srcdir}/imagetool-uncompressed.py arch/arm/boot/ - msg2 "Disable out of tree RTL8192 driver and enable rtlwifi" + msg2 "Disable out of tree RTL8192 driver and enable rtlwifi" patch -Np1 -i "${srcdir}/disable-8192cu.patch" - msg2 "AUFS patches" + msg2 "AUFS patches" cp -ru "${srcdir}/aufs3-standalone/Documentation" . cp -ru "${srcdir}/aufs3-standalone/fs" . cp -ru "${srcdir}/aufs3-standalone/include/uapi/linux/aufs_type.h" ./include/linux @@ -70,7 +72,10 @@ msg "Patches:" patch -Np1 -i ../aufs3-standalone/aufs3-mmap.patch patch -Np1 -i ../aufs3-standalone/aufs3-standalone.patch - msg2 "Add BFQ patches" + msg2 "Fix vhci warning in kmod" + patch -p1 -i "${srcdir}/0001-Bluetooth-allocate-static-minor-for-vhci.patch" + + msg2 "Add BFQ patches" patch -Np1 -i "${srcdir}/0001-block-cgroups-kconfig-build-bits-for-BFQ-${bfqver}-${pkgver:0:4}.patch" patch -Np1 -i "${srcdir}/0002-block-introduce-the-BFQ-${bfqver}-I-O-sched-for-${pkgver:0:4}.patch" patch -Np1 -i "${srcdir}/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-${bfqver}-for-${pkgver:0:4}.0.patch" @@ -226,7 +231,7 @@ mkdir -p "${pkgdir}/usr/src/linux-${_kernver}/drivers/media/pci/${i}" cp -a drivers/media/pci/${i}/*.h "${pkgdir}/usr/src/linux-${_kernver}/drivers/media/pci/${i}" done # usb - for i in cpia2 em28xx pwc sn9c102; do + for i in cpia2 em28xx pwc; do mkdir -p "${pkgdir}/usr/src/linux-${_kernver}/drivers/media/usb/${i}" cp -a drivers/media/usb/${i}/*.h "${pkgdir}/usr/src/linux-${_kernver}/drivers/media/usb/${i}" done diff --git a/core/linux-raspberrypi-latest/config b/core/linux-raspberrypi-latest/config index 4ec083250..7f1339d1a 100644 --- a/core/linux-raspberrypi-latest/config +++ b/core/linux-raspberrypi-latest/config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/arm 3.13.9-1 Kernel Configuration +# Linux/arm 3.14.0-1 Kernel Configuration # CONFIG_ARM=y CONFIG_SYS_SUPPORTS_APM_EMULATION=y @@ -131,7 +131,6 @@ CONFIG_IPC_NS=y CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y -CONFIG_UIDGID_STRICT_TYPE_CHECKS=y CONFIG_SCHED_AUTOGROUP=y CONFIG_MM_OWNER=y CONFIG_SYSFS_DEPRECATED=y @@ -187,6 +186,8 @@ CONFIG_HAVE_OPROFILE=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y # CONFIG_HAVE_64BIT_ALIGNED_ACCESS is not set +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y CONFIG_KRETPROBES=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y @@ -203,6 +204,11 @@ CONFIG_HAVE_PERF_REGS=y CONFIG_HAVE_PERF_USER_STACK_DUMP=y CONFIG_HAVE_ARCH_JUMP_LABEL=y CONFIG_ARCH_WANT_IPC_PARSE_VERSION=y +CONFIG_HAVE_CC_STACKPROTECTOR=y +# CONFIG_CC_STACKPROTECTOR is not set +CONFIG_CC_STACKPROTECTOR_NONE=y +# CONFIG_CC_STACKPROTECTOR_REGULAR is not set +# CONFIG_CC_STACKPROTECTOR_STRONG is not set CONFIG_HAVE_CONTEXT_TRACKING=y CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y @@ -232,7 +238,7 @@ CONFIG_BLOCK=y CONFIG_LBDAF=y CONFIG_BLK_DEV_BSG=y CONFIG_BLK_DEV_BSGLIB=y -# CONFIG_BLK_DEV_INTEGRITY is not set +CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_THROTTLING=y # CONFIG_BLK_CMDLINE_PARSER is not set @@ -307,8 +313,8 @@ CONFIG_ARCH_BCM2708=y # CONFIG_ARCH_W90X900 is not set # CONFIG_ARCH_LPC32XX is not set # CONFIG_ARCH_PXA is not set -# CONFIG_ARCH_MSM is not set -# CONFIG_ARCH_SHMOBILE is not set +# CONFIG_ARCH_MSM_NODT is not set +# CONFIG_ARCH_SHMOBILE_LEGACY is not set # CONFIG_ARCH_RPC is not set # CONFIG_ARCH_SA1100 is not set # CONFIG_ARCH_S3C24XX is not set @@ -403,6 +409,7 @@ CONFIG_ARCH_WANT_GENERAL_HUGETLB=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_HAVE_MEMBLOCK=y +CONFIG_NO_BOOTMEM=y CONFIG_MEMORY_ISOLATION=y # CONFIG_HAVE_BOOTMEM_INFO_NODE is not set CONFIG_PAGEFLAGS_EXTENDED=y @@ -421,11 +428,12 @@ CONFIG_CMA=y # CONFIG_CMA_DEBUG is not set # CONFIG_ZBUD is not set # CONFIG_ZSWAP is not set +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set CONFIG_FORCE_MAX_ZONEORDER=11 CONFIG_ALIGNMENT_TRAP=y CONFIG_UACCESS_WITH_MEMCPY=y CONFIG_SECCOMP=y -CONFIG_CC_STACKPROTECTOR=y CONFIG_SWIOTLB=y CONFIG_IOMMU_HELPER=y @@ -677,6 +685,7 @@ CONFIG_NF_NAT_SIP=m CONFIG_NF_NAT_TFTP=m CONFIG_NETFILTER_SYNPROXY=m CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m CONFIG_NFT_CT=m @@ -686,6 +695,9 @@ CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m CONFIG_NFT_NAT=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m CONFIG_NFT_COMPAT=m CONFIG_NETFILTER_XTABLES=m @@ -729,6 +741,7 @@ CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m # CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m CONFIG_NETFILTER_XT_MATCH_CLUSTER=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m @@ -745,7 +758,9 @@ CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_LIMIT=m CONFIG_NETFILTER_XT_MATCH_MAC=m @@ -793,9 +808,9 @@ CONFIG_NF_DEFRAG_IPV4=m CONFIG_NF_CONNTRACK_IPV4=m CONFIG_NF_CONNTRACK_PROC_COMPAT=y CONFIG_NF_TABLES_IPV4=m -CONFIG_NFT_REJECT_IPV4=m CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_CHAIN_NAT_IPV4=m +CONFIG_NFT_REJECT_IPV4=m CONFIG_NF_TABLES_ARP=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m @@ -832,6 +847,7 @@ CONFIG_NF_CONNTRACK_IPV6=m CONFIG_NF_TABLES_IPV6=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_CHAIN_NAT_IPV6=m +CONFIG_NFT_REJECT_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m CONFIG_IP6_NF_MATCH_EUI64=m @@ -904,6 +920,7 @@ CONFIG_LLC=m # CONFIG_LAPB is not set # CONFIG_PHONET is not set # CONFIG_IEEE802154 is not set +CONFIG_6LOWPAN_IPHC=m CONFIG_NET_SCHED=y # @@ -929,6 +946,8 @@ CONFIG_NET_SCH_QFQ=m CONFIG_NET_SCH_CODEL=m CONFIG_NET_SCH_FQ_CODEL=m # CONFIG_NET_SCH_FQ is not set +# CONFIG_NET_SCH_HHF is not set +CONFIG_NET_SCH_PIE=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m @@ -984,7 +1003,8 @@ CONFIG_VSOCKETS=m # CONFIG_NETLINK_DIAG is not set # CONFIG_NET_MPLS_GSO is not set # CONFIG_HSR is not set -CONFIG_NETPRIO_CGROUP=y +CONFIG_CGROUP_NET_PRIO=m +CONFIG_CGROUP_NET_CLASSID=y CONFIG_NET_RX_BUSY_POLL=y CONFIG_BQL=y CONFIG_BPF_JIT=y @@ -1179,6 +1199,8 @@ CONFIG_CONNECTOR=m CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_ZRAM=m +# CONFIG_ZRAM_DEBUG is not set # CONFIG_BLK_DEV_COW_COMMON is not set CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 @@ -1320,10 +1342,10 @@ CONFIG_BLK_DEV_DM=y CONFIG_DM_BUFIO=y CONFIG_DM_BIO_PRISON=y CONFIG_DM_PERSISTENT_DATA=y +# CONFIG_DM_DEBUG_BLOCK_STACK_TRACING is not set CONFIG_DM_CRYPT=y CONFIG_DM_SNAPSHOT=m CONFIG_DM_THIN_PROVISIONING=y -# CONFIG_DM_DEBUG_BLOCK_STACK_TRACING is not set # CONFIG_DM_CACHE is not set CONFIG_DM_MIRROR=y CONFIG_DM_LOG_USERSPACE=m @@ -1481,6 +1503,7 @@ CONFIG_USB_NET_HUAWEI_CDC_NCM=m CONFIG_USB_NET_CDC_MBIM=m CONFIG_USB_NET_DM9601=m CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m CONFIG_USB_NET_SMSC75XX=m CONFIG_USB_NET_SMSC95XX=y CONFIG_USB_NET_GL620A=m @@ -1525,7 +1548,7 @@ CONFIG_ATH9K=m CONFIG_ATH9K_AHB=y # CONFIG_ATH9K_DEBUGFS is not set # CONFIG_ATH9K_DFS_CERTIFIED is not set -CONFIG_ATH9K_TX99=y +# CONFIG_ATH9K_WOW is not set # CONFIG_ATH9K_LEGACY_RATE_CONTROL is not set CONFIG_ATH9K_RFKILL=y CONFIG_ATH9K_HTC=m @@ -1540,6 +1563,7 @@ CONFIG_ATH10K=m # CONFIG_ATH10K_DEBUG is not set # CONFIG_ATH10K_DEBUGFS is not set # CONFIG_ATH10K_TRACING is not set +# CONFIG_ATH10K_DFS_CERTIFIED is not set CONFIG_WCN36XX=m # CONFIG_WCN36XX_DEBUGFS is not set CONFIG_B43=m @@ -1560,6 +1584,7 @@ CONFIG_B43LEGACY_PIO=y CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y # CONFIG_B43LEGACY_DMA_MODE is not set # CONFIG_B43LEGACY_PIO_MODE is not set +# CONFIG_BRCMSMAC is not set # CONFIG_BRCMFMAC is not set CONFIG_HOSTAP=m CONFIG_HOSTAP_FIRMWARE=y @@ -1870,16 +1895,15 @@ CONFIG_I2C_NOMADIK=y # CONFIG_I2C_OCORES is not set # CONFIG_I2C_PCA_PLATFORM is not set # CONFIG_I2C_PXA_PCI is not set -# CONFIG_I2C_SH_MOBILE is not set # CONFIG_I2C_SIMTEC is not set # CONFIG_I2C_XILINX is not set -# CONFIG_I2C_RCAR is not set # # External I2C/SMBus adapter drivers # CONFIG_I2C_DIOLAN_U2C=m # CONFIG_I2C_PARPORT_LIGHT is not set +CONFIG_I2C_ROBOTFUZZ_OSIF=m # CONFIG_I2C_TAOS_EVM is not set CONFIG_I2C_TINY_USB=m @@ -1946,6 +1970,7 @@ CONFIG_GPIO_SYSFS=y # CONFIG_GPIO_GENERIC_PLATFORM is not set # CONFIG_GPIO_PL061 is not set # CONFIG_GPIO_RCAR is not set +CONFIG_GPIO_SCH311X=m # CONFIG_GPIO_TS5500 is not set # @@ -1965,9 +1990,7 @@ CONFIG_GPIO_SYSFS=y # SPI GPIO expanders: # # CONFIG_GPIO_MAX7301 is not set -# CONFIG_GPIO_MCP23S08 is not set # CONFIG_GPIO_MC33880 is not set -# CONFIG_GPIO_74X164 is not set # # AC97 GPIO expanders: @@ -2150,7 +2173,6 @@ CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y # CONFIG_THERMAL_GOV_FAIR_SHARE is not set CONFIG_THERMAL_GOV_STEP_WISE=y # CONFIG_THERMAL_GOV_USER_SPACE is not set -# CONFIG_CPU_THERMAL is not set # CONFIG_THERMAL_EMULATION is not set CONFIG_THERMAL_BCM2835=y @@ -2217,6 +2239,7 @@ CONFIG_MFD_CORE=m CONFIG_MFD_88PM800=m CONFIG_MFD_88PM805=m # CONFIG_MFD_88PM860X is not set +# CONFIG_MFD_MAX14577 is not set # CONFIG_MFD_MAX77686 is not set # CONFIG_MFD_MAX77693 is not set # CONFIG_MFD_MAX8907 is not set @@ -2237,6 +2260,7 @@ CONFIG_MFD_88PM805=m # CONFIG_MFD_STMPE is not set # CONFIG_MFD_SYSCON is not set # CONFIG_MFD_TI_AM335X_TSCADC is not set +# CONFIG_MFD_LP3943 is not set # CONFIG_MFD_LP8788 is not set # CONFIG_MFD_PALMAS is not set # CONFIG_TPS6105X is not set @@ -2291,7 +2315,6 @@ CONFIG_VIDEOBUF_DVB=m CONFIG_VIDEOBUF2_CORE=m CONFIG_VIDEOBUF2_MEMOPS=m CONFIG_VIDEOBUF2_VMALLOC=m -# CONFIG_VIDEO_V4L2_INT_DEVICE is not set CONFIG_DVB_CORE=m CONFIG_DVB_NET=y CONFIG_TTPCI_EEPROM=m @@ -2386,7 +2409,6 @@ CONFIG_VIDEO_CPIA2=m CONFIG_USB_ZR364XX=m CONFIG_USB_STKWEBCAM=m CONFIG_USB_S2255=m -CONFIG_USB_SN9C102=m CONFIG_VIDEO_USBTV=m # @@ -2467,6 +2489,7 @@ CONFIG_DVB_B2C2_FLEXCOP_USB=m # Webcam, TV (analog/digital) USB devices # CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m CONFIG_VIDEO_EM28XX_ALSA=m CONFIG_VIDEO_EM28XX_DVB=m CONFIG_VIDEO_EM28XX_RC=m @@ -2490,13 +2513,13 @@ CONFIG_RADIO_TEA575X=m CONFIG_RADIO_SI470X=y CONFIG_USB_SI470X=m # CONFIG_I2C_SI470X is not set +# CONFIG_RADIO_SI4713 is not set CONFIG_USB_MR800=m CONFIG_USB_DSBR=m CONFIG_RADIO_SHARK=m CONFIG_RADIO_SHARK2=m -# CONFIG_I2C_SI4713 is not set -# CONFIG_RADIO_SI4713 is not set CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m CONFIG_USB_MA901=m # CONFIG_RADIO_TEA5764 is not set # CONFIG_RADIO_SAA7706H is not set @@ -2565,6 +2588,10 @@ CONFIG_VIDEO_MT9V011=m # Video improvement chips # +# +# Audio/Video compression chips +# + # # Miscellaneous helper chips # @@ -2599,6 +2626,7 @@ CONFIG_MEDIA_TUNER_FC0013=m CONFIG_MEDIA_TUNER_TDA18212=m CONFIG_MEDIA_TUNER_E4000=m CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88TS2022=m CONFIG_MEDIA_TUNER_TUA9001=m CONFIG_MEDIA_TUNER_IT913X=m CONFIG_MEDIA_TUNER_R820T=m @@ -2610,6 +2638,7 @@ CONFIG_DVB_STB0899=m CONFIG_DVB_STB6100=m CONFIG_DVB_STV090x=m CONFIG_DVB_STV6110x=m +CONFIG_DVB_M88DS3103=m # # Multistandard (cable + terrestrial) frontends @@ -2745,6 +2774,7 @@ CONFIG_FB_TILEBLITTING=y CONFIG_FB_BCM2708=y # CONFIG_FB_ARMCLCD is not set # CONFIG_FB_UVESA is not set +# CONFIG_FB_OPENCORES is not set # CONFIG_FB_S1D13XXX is not set CONFIG_FB_TMIO=m CONFIG_FB_TMIO_ACCELL=y @@ -3039,7 +3069,11 @@ CONFIG_USB_STORAGE_ENE_UB6250=m # CONFIG_USB_MDC800=m CONFIG_USB_MICROTEK=m +# CONFIG_USB_MUSB_HDRC is not set # CONFIG_USB_DWC3 is not set +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set # # USB port drivers @@ -3086,6 +3120,7 @@ CONFIG_USB_SERIAL_MCT_U232=m CONFIG_USB_SERIAL_METRO=m CONFIG_USB_SERIAL_MOS7720=m CONFIG_USB_SERIAL_MOS7840=m +# CONFIG_USB_SERIAL_MXUPORT is not set CONFIG_USB_SERIAL_NAVMAN=m CONFIG_USB_SERIAL_PL2303=m CONFIG_USB_SERIAL_OTI6858=m @@ -3140,6 +3175,7 @@ CONFIG_USB_EZUSB_FX2=m # USB Physical Layer drivers # # CONFIG_USB_PHY is not set +# CONFIG_USB_OTG_FSM is not set # CONFIG_NOP_USB_XCEIV is not set # CONFIG_AM335X_PHY_USB is not set # CONFIG_SAMSUNG_USB2PHY is not set @@ -3253,6 +3289,7 @@ CONFIG_RTC_DRV_MAX6900=m CONFIG_RTC_DRV_RS5C372=m CONFIG_RTC_DRV_ISL1208=m CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_ISL12057=m CONFIG_RTC_DRV_X1205=m CONFIG_RTC_DRV_PCF2127=m CONFIG_RTC_DRV_PCF8523=m @@ -3322,6 +3359,7 @@ CONFIG_DMADEVICES=y # CONFIG_DW_DMAC is not set # CONFIG_TIMB_DMA is not set # CONFIG_PL330_DMA is not set +CONFIG_DMA_BCM2835=m CONFIG_DMA_BCM2708=m CONFIG_DMA_ENGINE=y CONFIG_DMA_VIRTUAL_CHANNELS=m @@ -3364,9 +3402,6 @@ CONFIG_88EU_P2P=y # CONFIG_LINE6_USB is not set # CONFIG_USB_SERIAL_QUATECH2 is not set # CONFIG_VT6656 is not set -CONFIG_ZSMALLOC=y -CONFIG_ZRAM=m -# CONFIG_ZRAM_DEBUG is not set # CONFIG_USB_ENESTORAGE is not set # CONFIG_BCM_WIMAX is not set # CONFIG_FT1000 is not set @@ -3379,8 +3414,11 @@ CONFIG_ZRAM=m # CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI4 is not set CONFIG_STAGING_MEDIA=y # CONFIG_DVB_AS102 is not set +CONFIG_I2C_BCM2048=m # CONFIG_VIDEO_GO7007 is not set CONFIG_USB_MSI3101=m +# CONFIG_VIDEO_TCM825X is not set +CONFIG_USB_SN9C102=m CONFIG_LIRC_STAGING=y CONFIG_LIRC_IGORPLUGUSB=m CONFIG_LIRC_IMON=m @@ -3400,11 +3438,7 @@ CONFIG_USB_WPAN_HCD=m CONFIG_LTE_GDM724X=m # CONFIG_CED1401 is not set # CONFIG_DGRP is not set -CONFIG_USB_DWC2=m -# CONFIG_USB_DWC2_DEBUG is not set -# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set # CONFIG_LUSTRE_FS is not set -CONFIG_USB_BTMTK=m # CONFIG_DGAP is not set CONFIG_CLKDEV_LOOKUP=y @@ -3501,7 +3535,6 @@ CONFIG_QUOTACTL=y CONFIG_AUTOFS4_FS=y CONFIG_FUSE_FS=m CONFIG_CUSE=m -CONFIG_GENERIC_ACL=y # # Caches @@ -3770,6 +3803,7 @@ CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 # CONFIG_PANIC_ON_OOPS is not set CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 CONFIG_SCHED_DEBUG=y CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -3858,6 +3892,8 @@ CONFIG_PROBE_EVENTS=y # CONFIG_TEST_STRING_HELPERS is not set # CONFIG_TEST_KSTRTOX is not set # CONFIG_DMA_API_DEBUG is not set +# CONFIG_TEST_MODULE is not set +# CONFIG_TEST_USER_COPY is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y CONFIG_KGDB=y @@ -3866,6 +3902,7 @@ CONFIG_KGDB_SERIAL_CONSOLE=y CONFIG_KGDB_KDB=y CONFIG_KDB_KEYBOARD=y CONFIG_KDB_CONTINUE_CATASTROPHIC=0 +# CONFIG_ARM_PTDUMP is not set CONFIG_STRICT_DEVMEM=y CONFIG_ARM_UNWIND=y # CONFIG_DEBUG_USER is not set @@ -3877,6 +3914,7 @@ CONFIG_UNCOMPRESS_INCLUDE="mach/uncompress.h" # CONFIG_OC_ETM is not set # CONFIG_ARM_KPROBES_TEST is not set # CONFIG_PID_IN_CONTEXTIDR is not set +# CONFIG_DEBUG_SET_MODULE_RONX is not set # # Security options @@ -3984,7 +4022,7 @@ CONFIG_CRYPTO_XCBC=m # CONFIG_CRYPTO_CRC32C=y # CONFIG_CRYPTO_CRC32 is not set -CONFIG_CRYPTO_CRCT10DIF=m +CONFIG_CRYPTO_CRCT10DIF=y # CONFIG_CRYPTO_GHASH is not set CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y @@ -4058,7 +4096,7 @@ CONFIG_GENERIC_IO=y CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y CONFIG_CRC_CCITT=m CONFIG_CRC16=y -# CONFIG_CRC_T10DIF is not set +CONFIG_CRC_T10DIF=y CONFIG_CRC_ITU_T=y CONFIG_CRC32=y # CONFIG_CRC32_SELFTEST is not set diff --git a/core/linux-raspberrypi-latest/uksm-0.1.2.2-for-v3.13.patch b/core/linux-raspberrypi-latest/uksm-0.1.2.2-for-v3.13.patch deleted file mode 100644 index a13218c21..000000000 --- a/core/linux-raspberrypi-latest/uksm-0.1.2.2-for-v3.13.patch +++ /dev/null @@ -1,7064 +0,0 @@ -diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX -index a39d066..a03f094 100644 ---- a/Documentation/vm/00-INDEX -+++ b/Documentation/vm/00-INDEX -@@ -16,6 +16,8 @@ hwpoison.txt - - explains what hwpoison is - ksm.txt - - how to use the Kernel Samepage Merging feature. -+uksm.txt -+ - Introduction to Ultra KSM - locking - - info on how locking and synchronization is done in the Linux vm code. - numa -diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt -new file mode 100644 -index 0000000..9b2cb51 ---- /dev/null -+++ b/Documentation/vm/uksm.txt -@@ -0,0 +1,57 @@ -+The Ultra Kernel Samepage Merging feature -+---------------------------------------------- -+/* -+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia -+ * -+ * This is an improvement upon KSM. Some basic data structures and routines -+ * are borrowed from ksm.c . -+ * -+ * Its new features: -+ * 1. Full system scan: -+ * It automatically scans all user processes' anonymous VMAs. Kernel-user -+ * interaction to submit a memory area to KSM is no longer needed. -+ * -+ * 2. Rich area detection: -+ * It automatically detects rich areas containing abundant duplicated -+ * pages based. Rich areas are given a full scan speed. Poor areas are -+ * sampled at a reasonable speed with very low CPU consumption. -+ * -+ * 3. Ultra Per-page scan speed improvement: -+ * A new hash algorithm is proposed. As a result, on a machine with -+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it -+ * can scan memory areas that does not contain duplicated pages at speed of -+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of -+ * 477MB/sec ~ 923MB/sec. -+ * -+ * 4. Thrashing area avoidance: -+ * Thrashing area(an VMA that has frequent Ksm page break-out) can be -+ * filtered out. My benchmark shows it's more efficient than KSM's per-page -+ * hash value based volatile page detection. -+ * -+ * -+ * 5. Misc changes upon KSM: -+ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page -+ * comparison. It's much faster than default C version on x86. -+ * * rmap_item now has an struct *page member to loosely cache a -+ * address-->page mapping, which reduces too much time-costly -+ * follow_page(). -+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. -+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ -+ * ksm is needed for this case. -+ * -+ * 6. Full Zero Page consideration(contributed by Figo Zhang) -+ * Now uksmd consider full zero pages as special pages and merge them to an -+ * special unswappable uksm zero page. -+ */ -+ -+ChangeLog: -+ -+2012-05-05 The creation of this Doc -+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. -+2012-05-28 UKSM 0.1.1.2 bug fix release -+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 -+2012-07-2 UKSM 0.1.2-beta2 -+2012-07-10 UKSM 0.1.2-beta3 -+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. -+2012-10-13 UKSM 0.1.2.1 Bug fixes. -+2012-12-31 UKSM 0.1.2.2 Minor bug fixes -diff --git a/fs/exec.c b/fs/exec.c -index 7ea097f..ae44a37 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -19,7 +19,7 @@ - * current->executable is only used by the procfs. This allows a dispatch - * table to check for several different types of binary formats. We keep - * trying until we recognize the file or we run out of supported binary -- * formats. -+ * formats. - */ - - #include -@@ -55,6 +55,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1141,7 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm) - group */ - - current->self_exec_id++; -- -+ - flush_signal_handlers(current, 0); - do_close_on_exec(current->files); - } -@@ -1267,8 +1268,8 @@ static int check_unsafe_exec(struct linux_binprm *bprm) - return res; - } - --/* -- * Fill the binprm structure from the inode. -+/* -+ * Fill the binprm structure from the inode. - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes - * - * This may be called multiple times for binary chains (scripts for example). -diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c -index a77d2b2..d435567 100644 ---- a/fs/proc/meminfo.c -+++ b/fs/proc/meminfo.c -@@ -85,6 +85,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) - "SUnreclaim: %8lu kB\n" - "KernelStack: %8lu kB\n" - "PageTables: %8lu kB\n" -+#ifdef CONFIG_UKSM -+ "KsmZeroPages: %8lu kB\n" -+#endif - #ifdef CONFIG_QUICKLIST - "Quicklists: %8lu kB\n" - #endif -@@ -138,6 +141,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) - K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, - K(global_page_state(NR_PAGETABLE)), -+#ifdef CONFIG_UKSM -+ K(global_page_state(NR_UKSM_ZERO_PAGES)), -+#endif - #ifdef CONFIG_QUICKLIST - K(quicklist_total_size()), - #endif -diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h -index db09234..b936093 100644 ---- a/include/asm-generic/pgtable.h -+++ b/include/asm-generic/pgtable.h -@@ -502,12 +502,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); - #endif - -+#ifdef CONFIG_UKSM -+static inline int is_uksm_zero_pfn(unsigned long pfn) -+{ -+ extern unsigned long uksm_zero_pfn; -+ return pfn == uksm_zero_pfn; -+} -+#else -+static inline int is_uksm_zero_pfn(unsigned long pfn) -+{ -+ return 0; -+} -+#endif -+ - #ifdef __HAVE_COLOR_ZERO_PAGE - static inline int is_zero_pfn(unsigned long pfn) - { - extern unsigned long zero_pfn; - unsigned long offset_from_zero_pfn = pfn - zero_pfn; -- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); -+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); - } - - #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) -@@ -516,7 +529,7 @@ static inline int is_zero_pfn(unsigned long pfn) - static inline int is_zero_pfn(unsigned long pfn) - { - extern unsigned long zero_pfn; -- return pfn == zero_pfn; -+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); - } - - static inline unsigned long my_zero_pfn(unsigned long addr) -diff --git a/include/linux/ksm.h b/include/linux/ksm.h -index 45c9b6a..c7de7a7 100644 ---- a/include/linux/ksm.h -+++ b/include/linux/ksm.h -@@ -19,21 +19,6 @@ struct mem_cgroup; - #ifdef CONFIG_KSM - int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags); --int __ksm_enter(struct mm_struct *mm); --void __ksm_exit(struct mm_struct *mm); -- --static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) --{ -- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) -- return __ksm_enter(mm); -- return 0; --} -- --static inline void ksm_exit(struct mm_struct *mm) --{ -- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) -- __ksm_exit(mm); --} - - /* - * A KSM page is one of those write-protected "shared pages" or "merged pages" -@@ -80,6 +65,33 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg); - void ksm_migrate_page(struct page *newpage, struct page *oldpage); - -+#ifdef CONFIG_KSM_LEGACY -+int __ksm_enter(struct mm_struct *mm); -+void __ksm_exit(struct mm_struct *mm); -+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+{ -+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) -+ return __ksm_enter(mm); -+ return 0; -+} -+ -+static inline void ksm_exit(struct mm_struct *mm) -+{ -+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) -+ __ksm_exit(mm); -+} -+ -+#elif defined(CONFIG_UKSM) -+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+{ -+ return 0; -+} -+ -+static inline void ksm_exit(struct mm_struct *mm) -+{ -+} -+#endif /* !CONFIG_UKSM */ -+ - #else /* !CONFIG_KSM */ - - static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -@@ -132,4 +144,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) - #endif /* CONFIG_MMU */ - #endif /* !CONFIG_KSM */ - -+#include -+ - #endif /* __LINUX_KSM_H */ -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 290901a..a6d4935 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -307,6 +307,9 @@ struct vm_area_struct { - #ifdef CONFIG_NUMA - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ - #endif -+#ifdef CONFIG_UKSM -+ struct vma_slot *uksm_vma_slot; -+#endif - }; - - struct core_thread { -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index bd791e4..44bb9dd 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -144,6 +144,9 @@ enum zone_stat_item { - #endif - NR_ANON_TRANSPARENT_HUGEPAGES, - NR_FREE_CMA_PAGES, -+#ifdef CONFIG_UKSM -+ NR_UKSM_ZERO_PAGES, -+#endif - NR_VM_ZONE_STAT_ITEMS }; - - /* -@@ -870,7 +873,7 @@ static inline int is_highmem_idx(enum zone_type idx) - } - - /** -- * is_highmem - helper function to quickly check if a struct zone is a -+ * is_highmem - helper function to quickly check if a struct zone is a - * highmem zone or not. This is an attempt to keep references - * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. - * @zone - pointer to struct zone variable -diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h -new file mode 100644 -index 0000000..6780fdb ---- /dev/null -+++ b/include/linux/sradix-tree.h -@@ -0,0 +1,77 @@ -+#ifndef _LINUX_SRADIX_TREE_H -+#define _LINUX_SRADIX_TREE_H -+ -+ -+#define INIT_SRADIX_TREE(root, mask) \ -+do { \ -+ (root)->height = 0; \ -+ (root)->gfp_mask = (mask); \ -+ (root)->rnode = NULL; \ -+} while (0) -+ -+#define ULONG_BITS (sizeof(unsigned long) * 8) -+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -+//#define SRADIX_TREE_MAP_SHIFT 6 -+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) -+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) -+ -+struct sradix_tree_node { -+ unsigned int height; /* Height from the bottom */ -+ unsigned int count; -+ unsigned int fulls; /* Number of full sublevel trees */ -+ struct sradix_tree_node *parent; -+ void *stores[0]; -+}; -+ -+/* A simple radix tree implementation */ -+struct sradix_tree_root { -+ unsigned int height; -+ struct sradix_tree_node *rnode; -+ -+ /* Where found to have available empty stores in its sublevels */ -+ struct sradix_tree_node *enter_node; -+ unsigned int shift; -+ unsigned int stores_size; -+ unsigned int mask; -+ unsigned long min; /* The first hole index */ -+ unsigned long num; -+ //unsigned long *height_to_maxindex; -+ -+ /* How the node is allocated and freed. */ -+ struct sradix_tree_node *(*alloc)(void); -+ void (*free)(struct sradix_tree_node *node); -+ -+ /* When a new node is added and removed */ -+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); -+ void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); -+ void (*rm)(struct sradix_tree_node *node, unsigned offset); -+}; -+ -+struct sradix_tree_path { -+ struct sradix_tree_node *node; -+ int offset; -+}; -+ -+static inline -+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) -+{ -+ root->height = 0; -+ root->rnode = NULL; -+ root->shift = shift; -+ root->stores_size = 1UL << shift; -+ root->mask = root->stores_size - 1; -+} -+ -+ -+extern void *sradix_tree_next(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index, -+ int (*iter)(void *, unsigned long)); -+ -+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); -+ -+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index); -+ -+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); -+ -+#endif /* _LINUX_SRADIX_TREE_H */ -diff --git a/include/linux/uksm.h b/include/linux/uksm.h -new file mode 100644 -index 0000000..a644bca ---- /dev/null -+++ b/include/linux/uksm.h -@@ -0,0 +1,146 @@ -+#ifndef __LINUX_UKSM_H -+#define __LINUX_UKSM_H -+/* -+ * Memory merging support. -+ * -+ * This code enables dynamic sharing of identical pages found in different -+ * memory areas, even if they are not shared by fork(). -+ */ -+ -+/* if !CONFIG_UKSM this file should not be compiled at all. */ -+#ifdef CONFIG_UKSM -+ -+#include -+#include -+#include -+#include -+#include -+ -+extern unsigned long zero_pfn __read_mostly; -+extern unsigned long uksm_zero_pfn __read_mostly; -+extern struct page *empty_uksm_zero_page; -+ -+/* must be done before linked to mm */ -+extern void uksm_vma_add_new(struct vm_area_struct *vma); -+extern void uksm_remove_vma(struct vm_area_struct *vma); -+ -+#define UKSM_SLOT_NEED_SORT (1 << 0) -+#define UKSM_SLOT_NEED_RERAND (1 << 1) -+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ -+#define UKSM_SLOT_FUL_SCANNED (1 << 3) -+#define UKSM_SLOT_IN_UKSM (1 << 4) -+ -+struct vma_slot { -+ struct sradix_tree_node *snode; -+ unsigned long sindex; -+ -+ struct list_head slot_list; -+ unsigned long fully_scanned_round; -+ unsigned long dedup_num; -+ unsigned long pages_scanned; -+ unsigned long last_scanned; -+ unsigned long pages_to_scan; -+ struct scan_rung *rung; -+ struct page **rmap_list_pool; -+ unsigned int *pool_counts; -+ unsigned long pool_size; -+ struct vm_area_struct *vma; -+ struct mm_struct *mm; -+ unsigned long ctime_j; -+ unsigned long pages; -+ unsigned long flags; -+ unsigned long pages_cowed; /* pages cowed this round */ -+ unsigned long pages_merged; /* pages merged this round */ -+ unsigned long pages_bemerged; -+ -+ /* when it has page merged in this eval round */ -+ struct list_head dedup_list; -+}; -+ -+static inline void uksm_unmap_zero_page(pte_t pte) -+{ -+ if (pte_pfn(pte) == uksm_zero_pfn) -+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -+} -+ -+static inline void uksm_map_zero_page(pte_t pte) -+{ -+ if (pte_pfn(pte) == uksm_zero_pfn) -+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -+} -+ -+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -+{ -+ if (vma->uksm_vma_slot && PageKsm(page)) -+ vma->uksm_vma_slot->pages_cowed++; -+} -+ -+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -+{ -+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) -+ vma->uksm_vma_slot->pages_cowed++; -+} -+ -+static inline int uksm_flags_can_scan(unsigned long vm_flags) -+{ -+#ifndef VM_SAO -+#define VM_SAO 0 -+#endif -+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | -+ VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP | -+ VM_SHARED | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO)); -+} -+ -+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -+{ -+ if (uksm_flags_can_scan(*vm_flags_p)) -+ *vm_flags_p |= VM_MERGEABLE; -+} -+ -+/* -+ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will -+ * be removed when uksm zero page patch is stable enough. -+ */ -+static inline void uksm_bugon_zeropage(pte_t pte) -+{ -+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); -+} -+#else -+static inline void uksm_vma_add_new(struct vm_area_struct *vma) -+{ -+} -+ -+static inline void uksm_remove_vma(struct vm_area_struct *vma) -+{ -+} -+ -+static inline void uksm_unmap_zero_page(pte_t pte) -+{ -+} -+ -+static inline void uksm_map_zero_page(pte_t pte) -+{ -+} -+ -+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -+{ -+} -+ -+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -+{ -+} -+ -+static inline int uksm_flags_can_scan(unsigned long vm_flags) -+{ -+ return 0; -+} -+ -+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -+{ -+} -+ -+static inline void uksm_bugon_zeropage(pte_t pte) -+{ -+} -+#endif /* !CONFIG_UKSM */ -+#endif /* __LINUX_UKSM_H */ -diff --git a/kernel/fork.c b/kernel/fork.c -index dfa736c..79a2187 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -394,7 +394,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) - goto fail_nomem; - charge = len; - } -- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); -+ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; -@@ -449,7 +449,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; -- -+ uksm_vma_add_new(tmp); - mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); - -diff --git a/lib/Makefile b/lib/Makefile -index a459c31..9630dae 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) - endif - - lib-y := ctype.o string.o vsprintf.o cmdline.o \ -- rbtree.o radix-tree.o dump_stack.o timerqueue.o\ -+ rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ - idr.o int_sqrt.o extable.o \ - sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ - proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ -diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c -new file mode 100644 -index 0000000..8d06329 ---- /dev/null -+++ b/lib/sradix-tree.c -@@ -0,0 +1,476 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) -+{ -+ return node->fulls == root->stores_size || -+ (node->height == 1 && node->count == root->stores_size); -+} -+ -+/* -+ * Extend a sradix tree so it can store key @index. -+ */ -+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) -+{ -+ struct sradix_tree_node *node; -+ unsigned int height; -+ -+ if (unlikely(root->rnode == NULL)) { -+ if (!(node = root->alloc())) -+ return -ENOMEM; -+ -+ node->height = 1; -+ root->rnode = node; -+ root->height = 1; -+ } -+ -+ /* Figure out what the height should be. */ -+ height = root->height; -+ index >>= root->shift * height; -+ -+ while (index) { -+ index >>= root->shift; -+ height++; -+ } -+ -+ while (height > root->height) { -+ unsigned int newheight; -+ if (!(node = root->alloc())) -+ return -ENOMEM; -+ -+ /* Increase the height. */ -+ node->stores[0] = root->rnode; -+ root->rnode->parent = node; -+ if (root->extend) -+ root->extend(node, root->rnode); -+ -+ newheight = root->height + 1; -+ node->height = newheight; -+ node->count = 1; -+ if (sradix_node_full(root, root->rnode)) -+ node->fulls = 1; -+ -+ root->rnode = node; -+ root->height = newheight; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Search the next item from the current node, that is not NULL -+ * and can satify root->iter(). -+ */ -+void *sradix_tree_next(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index, -+ int (*iter)(void *item, unsigned long height)) -+{ -+ unsigned long offset; -+ void *item; -+ -+ if (unlikely(node == NULL)) { -+ node = root->rnode; -+ for (offset = 0; offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (unlikely(offset >= root->stores_size)) -+ return NULL; -+ -+ if (node->height == 1) -+ return item; -+ else -+ goto go_down; -+ } -+ -+ while (node) { -+ offset = (index & root->mask) + 1; -+ for (;offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (offset < root->stores_size) -+ break; -+ -+ node = node->parent; -+ index >>= root->shift; -+ } -+ -+ if (!node) -+ return NULL; -+ -+ while (node->height > 1) { -+go_down: -+ node = item; -+ for (offset = 0; offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (unlikely(offset >= root->stores_size)) -+ return NULL; -+ } -+ -+ BUG_ON(offset > root->stores_size); -+ -+ return item; -+} -+ -+/* -+ * Blindly insert the item to the tree. Typically, we reuse the -+ * first empty store item. -+ */ -+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) -+{ -+ unsigned long index; -+ unsigned int height; -+ struct sradix_tree_node *node, *tmp = NULL; -+ int offset, offset_saved; -+ void **store = NULL; -+ int error, i, j, shift; -+ -+go_on: -+ index = root->min; -+ -+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { -+ node = root->enter_node; -+ BUG_ON((index >> (root->shift * root->height))); -+ } else { -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height)) -+ || sradix_node_full(root, node)) { -+ error = sradix_tree_extend(root, index); -+ if (error) -+ return error; -+ -+ node = root->rnode; -+ } -+ } -+ -+ -+ height = node->height; -+ shift = (height - 1) * root->shift; -+ offset = (index >> shift) & root->mask; -+ while (shift > 0) { -+ offset_saved = offset; -+ for (; offset < root->stores_size; offset++) { -+ store = &node->stores[offset]; -+ tmp = *store; -+ -+ if (!tmp || !sradix_node_full(root, tmp)) -+ break; -+ } -+ BUG_ON(offset >= root->stores_size); -+ -+ if (offset != offset_saved) { -+ index += (offset - offset_saved) << shift; -+ index &= ~((1UL << shift) - 1); -+ } -+ -+ if (!tmp) { -+ if (!(tmp = root->alloc())) -+ return -ENOMEM; -+ -+ tmp->height = shift / root->shift; -+ *store = tmp; -+ tmp->parent = node; -+ node->count++; -+// if (root->extend) -+// root->extend(node, tmp); -+ } -+ -+ node = tmp; -+ shift -= root->shift; -+ offset = (index >> shift) & root->mask; -+ } -+ -+ BUG_ON(node->height != 1); -+ -+ -+ store = &node->stores[offset]; -+ for (i = 0, j = 0; -+ j < root->stores_size - node->count && -+ i < root->stores_size - offset && j < num; i++) { -+ if (!store[i]) { -+ store[i] = item[j]; -+ if (root->assign) -+ root->assign(node, index + i, item[j]); -+ j++; -+ } -+ } -+ -+ node->count += j; -+ root->num += j; -+ num -= j; -+ -+ while (sradix_node_full(root, node)) { -+ node = node->parent; -+ if (!node) -+ break; -+ -+ node->fulls++; -+ } -+ -+ if (unlikely(!node)) { -+ /* All nodes are full */ -+ root->min = 1 << (root->height * root->shift); -+ root->enter_node = NULL; -+ } else { -+ root->min = index + i - 1; -+ root->min |= (1UL << (node->height - 1)) - 1; -+ root->min++; -+ root->enter_node = node; -+ } -+ -+ if (num) { -+ item += j; -+ goto go_on; -+ } -+ -+ return 0; -+} -+ -+ -+/** -+ * sradix_tree_shrink - shrink height of a sradix tree to minimal -+ * @root sradix tree root -+ * -+ */ -+static inline void sradix_tree_shrink(struct sradix_tree_root *root) -+{ -+ /* try to shrink tree height */ -+ while (root->height > 1) { -+ struct sradix_tree_node *to_free = root->rnode; -+ -+ /* -+ * The candidate node has more than one child, or its child -+ * is not at the leftmost store, we cannot shrink. -+ */ -+ if (to_free->count != 1 || !to_free->stores[0]) -+ break; -+ -+ root->rnode = to_free->stores[0]; -+ root->rnode->parent = NULL; -+ root->height--; -+ if (unlikely(root->enter_node == to_free)) { -+ root->enter_node = NULL; -+ } -+ root->free(to_free); -+ } -+} -+ -+/* -+ * Del the item on the known leaf node and index -+ */ -+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index) -+{ -+ unsigned int offset; -+ struct sradix_tree_node *start, *end; -+ -+ BUG_ON(node->height != 1); -+ -+ start = node; -+ while (node && !(--node->count)) -+ node = node->parent; -+ -+ end = node; -+ if (!node) { -+ root->rnode = NULL; -+ root->height = 0; -+ root->min = 0; -+ root->num = 0; -+ root->enter_node = NULL; -+ } else { -+ offset = (index >> (root->shift * (node->height - 1))) & root->mask; -+ if (root->rm) -+ root->rm(node, offset); -+ node->stores[offset] = NULL; -+ root->num--; -+ if (root->min > index) { -+ root->min = index; -+ root->enter_node = node; -+ } -+ } -+ -+ if (start != end) { -+ do { -+ node = start; -+ start = start->parent; -+ if (unlikely(root->enter_node == node)) -+ root->enter_node = end; -+ root->free(node); -+ } while (start != end); -+ -+ /* -+ * Note that shrink may free "end", so enter_node still need to -+ * be checked inside. -+ */ -+ sradix_tree_shrink(root); -+ } else if (node->count == root->stores_size - 1) { -+ /* It WAS a full leaf node. Update the ancestors */ -+ node = node->parent; -+ while (node) { -+ node->fulls--; -+ if (node->fulls != root->stores_size - 1) -+ break; -+ -+ node = node->parent; -+ } -+ } -+} -+ -+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node; -+ int shift; -+ -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height))) -+ return NULL; -+ -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ node = node->stores[offset]; -+ if (!node) -+ return NULL; -+ -+ shift -= root->shift; -+ } while (shift >= 0); -+ -+ return node; -+} -+ -+/* -+ * Return the item if it exists, otherwise create it in place -+ * and return the created item. -+ */ -+void *sradix_tree_lookup_create(struct sradix_tree_root *root, -+ unsigned long index, void *(*item_alloc)(void)) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node, *tmp; -+ void *item; -+ int shift, error; -+ -+ if (root->rnode == NULL || (index >> (root->shift * root->height))) { -+ if (item_alloc) { -+ error = sradix_tree_extend(root, index); -+ if (error) -+ return NULL; -+ } else { -+ return NULL; -+ } -+ } -+ -+ node = root->rnode; -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ if (!node->stores[offset]) { -+ if (!(tmp = root->alloc())) -+ return NULL; -+ -+ tmp->height = shift / root->shift; -+ node->stores[offset] = tmp; -+ tmp->parent = node; -+ node->count++; -+ node = tmp; -+ } else { -+ node = node->stores[offset]; -+ } -+ -+ shift -= root->shift; -+ } while (shift > 0); -+ -+ BUG_ON(node->height != 1); -+ offset = index & root->mask; -+ if (node->stores[offset]) { -+ return node->stores[offset]; -+ } else if (item_alloc) { -+ if (!(item = item_alloc())) -+ return NULL; -+ -+ node->stores[offset] = item; -+ -+ /* -+ * NOTE: we do NOT call root->assign here, since this item is -+ * newly created by us having no meaning. Caller can call this -+ * if it's necessary to do so. -+ */ -+ -+ node->count++; -+ root->num++; -+ -+ while (sradix_node_full(root, node)) { -+ node = node->parent; -+ if (!node) -+ break; -+ -+ node->fulls++; -+ } -+ -+ if (unlikely(!node)) { -+ /* All nodes are full */ -+ root->min = 1 << (root->height * root->shift); -+ } else { -+ if (root->min == index) { -+ root->min |= (1UL << (node->height - 1)) - 1; -+ root->min++; -+ root->enter_node = node; -+ } -+ } -+ -+ return item; -+ } else { -+ return NULL; -+ } -+ -+} -+ -+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node; -+ int shift; -+ -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height))) -+ return -ENOENT; -+ -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ node = node->stores[offset]; -+ if (!node) -+ return -ENOENT; -+ -+ shift -= root->shift; -+ } while (shift > 0); -+ -+ offset = index & root->mask; -+ if (!node->stores[offset]) -+ return -ENOENT; -+ -+ sradix_tree_delete_from_leaf(root, node, index); -+ -+ return 0; -+} -diff --git a/mm/Kconfig b/mm/Kconfig -index 723bbe0..da46fbd 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -324,6 +324,32 @@ config KSM - See Documentation/vm/ksm.txt for more information: KSM is inactive - until a program has madvised that an area is MADV_MERGEABLE, and - root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). -+choice -+ prompt "Choose UKSM/KSM strategy" -+ default UKSM -+ depends on KSM -+ help -+ This option allows to select a UKSM/KSM stragety. -+ -+config UKSM -+ bool "Ultra-KSM for page merging" -+ depends on KSM -+ help -+ UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same -+ page Merging), but with a fundamentally rewritten core algorithm. With -+ an advanced algorithm, UKSM now can transparently scans all anonymously -+ mapped user space applications with an significantly improved scan speed -+ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from -+ UKSM. Now UKSM has its first stable release and first real world enterprise user. -+ For more information, please goto its project page. -+ (www.kerneldedup.org) -+ -+config KSM_LEGACY -+ bool "Legacy KSM implementation" -+ depends on KSM -+ help -+ The legacy KSM implementation from Redhat. -+endchoice - - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" -diff --git a/mm/Makefile b/mm/Makefile -index 305d10a..b5e5920 100644 ---- a/mm/Makefile -+++ b/mm/Makefile -@@ -40,7 +40,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o - obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o - obj-$(CONFIG_SLOB) += slob.o - obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o --obj-$(CONFIG_KSM) += ksm.o -+obj-$(CONFIG_KSM_LEGACY) += ksm.o -+obj-$(CONFIG_UKSM) += uksm.o - obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o - obj-$(CONFIG_SLAB) += slab.o - obj-$(CONFIG_SLUB) += slub.o -diff --git a/mm/memory.c b/mm/memory.c -index 6768ce9..146ef7d 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -116,6 +116,27 @@ __setup("norandmaps", disable_randmaps); - unsigned long zero_pfn __read_mostly; - unsigned long highest_memmap_pfn __read_mostly; - -+#ifdef CONFIG_UKSM -+unsigned long uksm_zero_pfn __read_mostly; -+struct page *empty_uksm_zero_page; -+ -+static int __init setup_uksm_zero_page(void) -+{ -+ unsigned long addr; -+ addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); -+ if (!addr) -+ panic("Oh boy, that early out of memory?"); -+ -+ empty_uksm_zero_page = virt_to_page((void *) addr); -+ SetPageReserved(empty_uksm_zero_page); -+ -+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); -+ -+ return 0; -+} -+core_initcall(setup_uksm_zero_page); -+#endif -+ - /* - * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() - */ -@@ -127,6 +148,7 @@ static int __init init_zero_pfn(void) - core_initcall(init_zero_pfn); - - -+ - #if defined(SPLIT_RSS_COUNTING) - - void sync_mm_rss(struct mm_struct *mm) -@@ -870,6 +892,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; -+ -+ /* Should return NULL in vm_normal_page() */ -+ uksm_bugon_zeropage(pte); -+ } else { -+ uksm_map_zero_page(pte); - } - - out_set_pte: -@@ -1112,8 +1139,10 @@ again: - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - tlb_remove_tlb_entry(tlb, pte, addr); -- if (unlikely(!page)) -+ if (unlikely(!page)) { -+ uksm_unmap_zero_page(ptent); - continue; -+ } - if (unlikely(details) && details->nonlinear_vma - && linear_page_index(details->nonlinear_vma, - addr) != page->index) { -@@ -1704,7 +1733,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - - VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - -- /* -+ /* - * Require read or write permissions. - * If FOLL_FORCE is set, we only require the "MAY" flags. - */ -@@ -1764,7 +1793,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - page = vm_normal_page(vma, start, *pte); - if (!page) { - if (!(gup_flags & FOLL_DUMP) && -- is_zero_pfn(pte_pfn(*pte))) -+ (is_zero_pfn(pte_pfn(*pte)))) - page = pte_page(*pte); - else { - pte_unmap(pte); -@@ -2579,8 +2608,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo - clear_page(kaddr); - kunmap_atomic(kaddr); - flush_dcache_page(dst); -- } else -+ } else { - copy_user_highpage(dst, src, va, vma); -+ uksm_cow_page(vma, src); -+ } - } - - /* -@@ -2787,6 +2818,7 @@ gotten: - new_page = alloc_zeroed_user_highpage_movable(vma, address); - if (!new_page) - goto oom; -+ uksm_cow_pte(vma, orig_pte); - } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!new_page) -@@ -2812,8 +2844,11 @@ gotten: - dec_mm_counter_fast(mm, MM_FILEPAGES); - inc_mm_counter_fast(mm, MM_ANONPAGES); - } -- } else -+ uksm_bugon_zeropage(orig_pte); -+ } else { -+ uksm_unmap_zero_page(orig_pte); - inc_mm_counter_fast(mm, MM_ANONPAGES); -+ } - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = mk_pte(new_page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); -diff --git a/mm/mmap.c b/mm/mmap.c -index 834b2d7..9211c9b 100644 ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -36,6 +36,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -65,7 +66,7 @@ static void unmap_region(struct mm_struct *mm, - * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (yes) yes w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes -- * -+ * - * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (copy) copy w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes -@@ -250,6 +251,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) - if (vma->vm_file) - fput(vma->vm_file); - mpol_put(vma_policy(vma)); -+ uksm_remove_vma(vma); - kmem_cache_free(vm_area_cachep, vma); - return next; - } -@@ -705,9 +707,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, - long adjust_next = 0; - int remove_next = 0; - -+/* -+ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is -+ * acquired -+ */ -+ uksm_remove_vma(vma); -+ - if (next && !insert) { - struct vm_area_struct *exporter = NULL; - -+ uksm_remove_vma(next); - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and -@@ -801,6 +810,7 @@ again: remove_next = 1 + (end > next->vm_end); - end_changed = true; - } - vma->vm_pgoff = pgoff; -+ - if (adjust_next) { - next->vm_start += adjust_next << PAGE_SHIFT; - next->vm_pgoff += adjust_next; -@@ -871,16 +881,22 @@ again: remove_next = 1 + (end > next->vm_end); - * up the code too much to do both in one go. - */ - next = vma->vm_next; -- if (remove_next == 2) -+ if (remove_next == 2) { -+ uksm_remove_vma(next); - goto again; -- else if (next) -+ } else if (next) { - vma_gap_update(next); -- else -+ } else { - mm->highest_vm_end = end; -+ } -+ } else { -+ if (next && !insert) -+ uksm_vma_add_new(next); - } - if (insert && file) - uprobe_mmap(insert); - -+ uksm_vma_add_new(vma); - validate_mm(mm); - - return 0; -@@ -1247,6 +1263,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - -+ /* If uksm is enabled, we add VM_MERGABLE to new VMAs. */ -+ uksm_vm_flags_mod(&vm_flags); -+ - if (flags & MAP_LOCKED) - if (!can_do_mlock()) - return -EPERM; -@@ -1591,6 +1610,7 @@ munmap_back: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); - file = vma->vm_file; -+ uksm_vma_add_new(vma); - out: - perf_event_mmap(vma); - -@@ -1627,6 +1647,7 @@ unmap_and_free_vma: - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - charged = 0; - free_vma: -+ uksm_remove_vma(vma); - kmem_cache_free(vm_area_cachep, vma); - unacct_error: - if (charged) -@@ -1875,7 +1896,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, - info.align_mask = 0; - return vm_unmapped_area(&info); - } --#endif -+#endif - - /* - * This mmap-allocator allocates new areas top-down from below the -@@ -2421,6 +2442,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, - else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - -+ uksm_vma_add_new(new); -+ - /* Success. */ - if (!err) - return 0; -@@ -2586,6 +2609,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) - return addr; - - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; -+ uksm_vm_flags_mod(&flags); - - error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (error & ~PAGE_MASK) -@@ -2653,6 +2677,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) - vma->vm_flags = flags; - vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); -+ uksm_vma_add_new(vma); - out: - perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; -@@ -2688,6 +2713,12 @@ void exit_mmap(struct mm_struct *mm) - /* mm's last user has gone, and its about to be pulled down */ - mmu_notifier_release(mm); - -+ /* -+ * Taking write lock on mmap_sem does not harm others, -+ * but it's crucial for uksm to avoid races. -+ */ -+ down_write(&mm->mmap_sem); -+ - if (mm->locked_vm) { - vma = mm->mmap; - while (vma) { -@@ -2724,6 +2755,11 @@ void exit_mmap(struct mm_struct *mm) - } - vm_unacct_memory(nr_accounted); - -+ mm->mmap = NULL; -+ mm->mm_rb = RB_ROOT; -+ mm->mmap_cache = NULL; -+ up_write(&mm->mmap_sem); -+ - WARN_ON(atomic_long_read(&mm->nr_ptes) > - (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); - } -@@ -2832,6 +2868,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); - *need_rmap_locks = false; -+ uksm_vma_add_new(new_vma); - } - } - return new_vma; -@@ -2933,10 +2970,10 @@ int install_special_mapping(struct mm_struct *mm, - ret = insert_vm_struct(mm, vma); - if (ret) - goto out; -- - mm->total_vm += len >> PAGE_SHIFT; - - perf_event_mmap(vma); -+ uksm_vma_add_new(vma); - - return 0; - -diff --git a/mm/rmap.c b/mm/rmap.c -index 068522d..7552b6d 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -971,9 +971,9 @@ void page_move_anon_rmap(struct page *page, - - /** - * __page_set_anon_rmap - set up new anonymous rmap -- * @page: Page to add to rmap -+ * @page: Page to add to rmap - * @vma: VM area to add page to. -- * @address: User virtual address of the mapping -+ * @address: User virtual address of the mapping - * @exclusive: the page is exclusively owned by the current process - */ - static void __page_set_anon_rmap(struct page *page, -diff --git a/mm/uksm.c b/mm/uksm.c -new file mode 100644 -index 0000000..794867a ---- /dev/null -+++ b/mm/uksm.c -@@ -0,0 +1,5640 @@ -+/* -+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia -+ * -+ * This is an improvement upon KSM. Some basic data structures and routines -+ * are borrowed from ksm.c . -+ * -+ * Its new features: -+ * 1. Full system scan: -+ * It automatically scans all user processes' anonymous VMAs. Kernel-user -+ * interaction to submit a memory area to KSM is no longer needed. -+ * -+ * 2. Rich area detection: -+ * It automatically detects rich areas containing abundant duplicated -+ * pages based. Rich areas are given a full scan speed. Poor areas are -+ * sampled at a reasonable speed with very low CPU consumption. -+ * -+ * 3. Ultra Per-page scan speed improvement: -+ * A new hash algorithm is proposed. As a result, on a machine with -+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it -+ * can scan memory areas that does not contain duplicated pages at speed of -+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of -+ * 477MB/sec ~ 923MB/sec. -+ * -+ * 4. Thrashing area avoidance: -+ * Thrashing area(an VMA that has frequent Ksm page break-out) can be -+ * filtered out. My benchmark shows it's more efficient than KSM's per-page -+ * hash value based volatile page detection. -+ * -+ * -+ * 5. Misc changes upon KSM: -+ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page -+ * comparison. It's much faster than default C version on x86. -+ * * rmap_item now has an struct *page member to loosely cache a -+ * address-->page mapping, which reduces too much time-costly -+ * follow_page(). -+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. -+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ -+ * ksm is needed for this case. -+ * -+ * 6. Full Zero Page consideration(contributed by Figo Zhang) -+ * Now uksmd consider full zero pages as special pages and merge them to an -+ * special unswappable uksm zero page. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include "internal.h" -+ -+#ifdef CONFIG_X86 -+#undef memcmp -+ -+#ifdef CONFIG_X86_32 -+#define memcmp memcmpx86_32 -+/* -+ * Compare 4-byte-aligned address s1 and s2, with length n -+ */ -+int memcmpx86_32(void *s1, void *s2, size_t n) -+{ -+ size_t num = n / 4; -+ register int res; -+ -+ __asm__ __volatile__ -+ ( -+ "testl %3,%3\n\t" -+ "repe; cmpsd\n\t" -+ "je 1f\n\t" -+ "sbbl %0,%0\n\t" -+ "orl $1,%0\n" -+ "1:" -+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) -+ : "0" (0) -+ : "cc"); -+ -+ return res; -+} -+ -+/* -+ * Check the page is all zero ? -+ */ -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned char same; -+ -+ len /= 4; -+ -+ __asm__ __volatile__ -+ ("repe; scasl;" -+ "sete %0" -+ : "=qm" (same), "+D" (s1), "+c" (len) -+ : "a" (0) -+ : "cc"); -+ -+ return same; -+} -+ -+ -+#elif defined(CONFIG_X86_64) -+#define memcmp memcmpx86_64 -+/* -+ * Compare 8-byte-aligned address s1 and s2, with length n -+ */ -+int memcmpx86_64(void *s1, void *s2, size_t n) -+{ -+ size_t num = n / 8; -+ register int res; -+ -+ __asm__ __volatile__ -+ ( -+ "testq %q3,%q3\n\t" -+ "repe; cmpsq\n\t" -+ "je 1f\n\t" -+ "sbbq %q0,%q0\n\t" -+ "orq $1,%q0\n" -+ "1:" -+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) -+ : "0" (0) -+ : "cc"); -+ -+ return res; -+} -+ -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned char same; -+ -+ len /= 8; -+ -+ __asm__ __volatile__ -+ ("repe; scasq;" -+ "sete %0" -+ : "=qm" (same), "+D" (s1), "+c" (len) -+ : "a" (0) -+ : "cc"); -+ -+ return same; -+} -+ -+#endif -+#else -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned long *src = s1; -+ int i; -+ -+ len /= sizeof(*src); -+ -+ for (i = 0; i < len; i++) { -+ if (src[i]) -+ return 0; -+ } -+ -+ return 1; -+} -+#endif -+ -+#define U64_MAX (~((u64)0)) -+#define UKSM_RUNG_ROUND_FINISHED (1 << 0) -+#define TIME_RATIO_SCALE 10000 -+ -+#define SLOT_TREE_NODE_SHIFT 8 -+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) -+struct slot_tree_node { -+ unsigned long size; -+ struct sradix_tree_node snode; -+ void *stores[SLOT_TREE_NODE_STORE_SIZE]; -+}; -+ -+static struct kmem_cache *slot_tree_node_cachep; -+ -+static struct sradix_tree_node *slot_tree_node_alloc(void) -+{ -+ struct slot_tree_node *p; -+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); -+ if (!p) -+ return NULL; -+ -+ return &p->snode; -+} -+ -+static void slot_tree_node_free(struct sradix_tree_node *node) -+{ -+ struct slot_tree_node *p; -+ -+ p = container_of(node, struct slot_tree_node, snode); -+ kmem_cache_free(slot_tree_node_cachep, p); -+} -+ -+static void slot_tree_node_extend(struct sradix_tree_node *parent, -+ struct sradix_tree_node *child) -+{ -+ struct slot_tree_node *p, *c; -+ -+ p = container_of(parent, struct slot_tree_node, snode); -+ c = container_of(child, struct slot_tree_node, snode); -+ -+ p->size += c->size; -+} -+ -+void slot_tree_node_assign(struct sradix_tree_node *node, -+ unsigned index, void *item) -+{ -+ struct vma_slot *slot = item; -+ struct slot_tree_node *cur; -+ -+ slot->snode = node; -+ slot->sindex = index; -+ -+ while (node) { -+ cur = container_of(node, struct slot_tree_node, snode); -+ cur->size += slot->pages; -+ node = node->parent; -+ } -+} -+ -+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) -+{ -+ struct vma_slot *slot; -+ struct slot_tree_node *cur; -+ unsigned long pages; -+ -+ if (node->height == 1) { -+ slot = node->stores[offset]; -+ pages = slot->pages; -+ } else { -+ cur = container_of(node->stores[offset], -+ struct slot_tree_node, snode); -+ pages = cur->size; -+ } -+ -+ while (node) { -+ cur = container_of(node, struct slot_tree_node, snode); -+ cur->size -= pages; -+ node = node->parent; -+ } -+} -+ -+unsigned long slot_iter_index; -+int slot_iter(void *item, unsigned long height) -+{ -+ struct slot_tree_node *node; -+ struct vma_slot *slot; -+ -+ if (height == 1) { -+ slot = item; -+ if (slot_iter_index < slot->pages) { -+ /*in this one*/ -+ return 1; -+ } else { -+ slot_iter_index -= slot->pages; -+ return 0; -+ } -+ -+ } else { -+ node = container_of(item, struct slot_tree_node, snode); -+ if (slot_iter_index < node->size) { -+ /*in this one*/ -+ return 1; -+ } else { -+ slot_iter_index -= node->size; -+ return 0; -+ } -+ } -+} -+ -+ -+static inline void slot_tree_init_root(struct sradix_tree_root *root) -+{ -+ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); -+ root->alloc = slot_tree_node_alloc; -+ root->free = slot_tree_node_free; -+ root->extend = slot_tree_node_extend; -+ root->assign = slot_tree_node_assign; -+ root->rm = slot_tree_node_rm; -+} -+ -+void slot_tree_init(void) -+{ -+ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", -+ sizeof(struct slot_tree_node), 0, -+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, -+ NULL); -+} -+ -+ -+/* Each rung of this ladder is a list of VMAs having a same scan ratio */ -+struct scan_rung { -+ //struct list_head scanned_list; -+ struct sradix_tree_root vma_root; -+ struct sradix_tree_root vma_root2; -+ -+ struct vma_slot *current_scan; -+ unsigned long current_offset; -+ -+ /* -+ * The initial value for current_offset, it should loop over -+ * [0~ step - 1] to let all slot have its chance to be scanned. -+ */ -+ unsigned long offset_init; -+ unsigned long step; /* dynamic step for current_offset */ -+ unsigned int flags; -+ unsigned long pages_to_scan; -+ //unsigned long fully_scanned_slots; -+ /* -+ * a little bit tricky - if cpu_time_ratio > 0, then the value is the -+ * the cpu time ratio it can spend in rung_i for every scan -+ * period. if < 0, then it is the cpu time ratio relative to the -+ * max cpu percentage user specified. Both in unit of -+ * 1/TIME_RATIO_SCALE -+ */ -+ int cpu_ratio; -+ -+ /* -+ * How long it will take for all slots in this rung to be fully -+ * scanned? If it's zero, we don't care about the cover time: -+ * it's fully scanned. -+ */ -+ unsigned int cover_msecs; -+ //unsigned long vma_num; -+ //unsigned long pages; /* Sum of all slot's pages in rung */ -+}; -+ -+/** -+ * node of either the stable or unstale rbtree -+ * -+ */ -+struct tree_node { -+ struct rb_node node; /* link in the main (un)stable rbtree */ -+ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ -+ u32 hash; -+ unsigned long count; /* TODO: merged with sub_root */ -+ struct list_head all_list; /* all tree nodes in stable/unstable tree */ -+}; -+ -+/** -+ * struct stable_node - node of the stable rbtree -+ * @node: rb node of this ksm page in the stable tree -+ * @hlist: hlist head of rmap_items using this ksm page -+ * @kpfn: page frame number of this ksm page -+ */ -+struct stable_node { -+ struct rb_node node; /* link in sub-rbtree */ -+ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ -+ struct hlist_head hlist; -+ unsigned long kpfn; -+ u32 hash_max; /* if ==0 then it's not been calculated yet */ -+ struct list_head all_list; /* in a list for all stable nodes */ -+}; -+ -+/** -+ * struct node_vma - group rmap_items linked in a same stable -+ * node together. -+ */ -+struct node_vma { -+ union { -+ struct vma_slot *slot; -+ unsigned long key; /* slot is used as key sorted on hlist */ -+ }; -+ struct hlist_node hlist; -+ struct hlist_head rmap_hlist; -+ struct stable_node *head; -+}; -+ -+/** -+ * struct rmap_item - reverse mapping item for virtual addresses -+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list -+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree -+ * @mm: the memory structure this rmap_item is pointing into -+ * @address: the virtual address this rmap_item tracks (+ flags in low bits) -+ * @node: rb node of this rmap_item in the unstable tree -+ * @head: pointer to stable_node heading this list in the stable tree -+ * @hlist: link into hlist of rmap_items hanging off that stable_node -+ */ -+struct rmap_item { -+ struct vma_slot *slot; -+ struct page *page; -+ unsigned long address; /* + low bits used for flags below */ -+ unsigned long hash_round; -+ unsigned long entry_index; -+ union { -+ struct {/* when in unstable tree */ -+ struct rb_node node; -+ struct tree_node *tree_node; -+ u32 hash_max; -+ }; -+ struct { /* when in stable tree */ -+ struct node_vma *head; -+ struct hlist_node hlist; -+ struct anon_vma *anon_vma; -+ }; -+ }; -+} __attribute__((aligned(4))); -+ -+struct rmap_list_entry { -+ union { -+ struct rmap_item *item; -+ unsigned long addr; -+ }; -+ /* lowest bit is used for is_addr tag */ -+} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ -+ -+ -+/* Basic data structure definition ends */ -+ -+ -+/* -+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. -+ * The flags use the low bits of rmap_item.address -+ */ -+#define UNSTABLE_FLAG 0x1 -+#define STABLE_FLAG 0x2 -+#define get_rmap_addr(x) ((x)->address & PAGE_MASK) -+ -+/* -+ * rmap_list_entry helpers -+ */ -+#define IS_ADDR_FLAG 1 -+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) -+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) -+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) -+ -+ -+/* -+ * High speed caches for frequently allocated and freed structs -+ */ -+static struct kmem_cache *rmap_item_cache; -+static struct kmem_cache *stable_node_cache; -+static struct kmem_cache *node_vma_cache; -+static struct kmem_cache *vma_slot_cache; -+static struct kmem_cache *tree_node_cache; -+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ -+ sizeof(struct __struct), __alignof__(struct __struct),\ -+ (__flags), NULL) -+ -+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ -+#define SCAN_LADDER_SIZE 4 -+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; -+ -+/* The evaluation rounds uksmd has finished */ -+static unsigned long long uksm_eval_round = 1; -+ -+/* -+ * we add 1 to this var when we consider we should rebuild the whole -+ * unstable tree. -+ */ -+static unsigned long uksm_hash_round = 1; -+ -+/* -+ * How many times the whole memory is scanned. -+ */ -+static unsigned long long fully_scanned_round = 1; -+ -+/* The total number of virtual pages of all vma slots */ -+static u64 uksm_pages_total; -+ -+/* The number of pages has been scanned since the start up */ -+static u64 uksm_pages_scanned; -+ -+static u64 scanned_virtual_pages; -+ -+/* The number of pages has been scanned since last encode_benefit call */ -+static u64 uksm_pages_scanned_last; -+ -+/* If the scanned number is tooo large, we encode it here */ -+static u64 pages_scanned_stored; -+ -+static unsigned long pages_scanned_base; -+ -+/* The number of nodes in the stable tree */ -+static unsigned long uksm_pages_shared; -+ -+/* The number of page slots additionally sharing those nodes */ -+static unsigned long uksm_pages_sharing; -+ -+/* The number of nodes in the unstable tree */ -+static unsigned long uksm_pages_unshared; -+ -+/* -+ * Milliseconds ksmd should sleep between scans, -+ * >= 100ms to be consistent with -+ * scan_time_to_sleep_msec() -+ */ -+static unsigned int uksm_sleep_jiffies; -+ -+/* The real value for the uksmd next sleep */ -+static unsigned int uksm_sleep_real; -+ -+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ -+static unsigned int uksm_sleep_saved; -+ -+/* Max percentage of cpu utilization ksmd can take to scan in one batch */ -+static unsigned int uksm_max_cpu_percentage; -+ -+static int uksm_cpu_governor; -+ -+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; -+ -+struct uksm_cpu_preset_s { -+ int cpu_ratio[SCAN_LADDER_SIZE]; -+ unsigned int cover_msecs[SCAN_LADDER_SIZE]; -+ unsigned int max_cpu; /* percentage */ -+}; -+ -+struct uksm_cpu_preset_s uksm_cpu_preset[4] = { -+ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, -+ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, -+ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, -+ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, -+}; -+ -+/* The default value for uksm_ema_page_time if it's not initialized */ -+#define UKSM_PAGE_TIME_DEFAULT 500 -+ -+/*cost to scan one page by expotional moving average in nsecs */ -+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; -+ -+/* The expotional moving average alpha weight, in percentage. */ -+#define EMA_ALPHA 20 -+ -+/* -+ * The threshold used to filter out thrashing areas, -+ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound -+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio -+ * will be considered as having a zero duplication ratio. -+ */ -+static unsigned int uksm_thrash_threshold = 50; -+ -+/* How much dedup ratio is considered to be abundant*/ -+static unsigned int uksm_abundant_threshold = 10; -+ -+/* All slots having merged pages in this eval round. */ -+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); -+ -+/* How many times the ksmd has slept since startup */ -+static unsigned long long uksm_sleep_times; -+ -+#define UKSM_RUN_STOP 0 -+#define UKSM_RUN_MERGE 1 -+static unsigned int uksm_run = 1; -+ -+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); -+static DEFINE_MUTEX(uksm_thread_mutex); -+ -+/* -+ * List vma_slot_new is for newly created vma_slot waiting to be added by -+ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to -+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding -+ * VMA has been removed/freed. -+ */ -+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); -+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); -+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); -+static DEFINE_SPINLOCK(vma_slot_list_lock); -+ -+/* The unstable tree heads */ -+static struct rb_root root_unstable_tree = RB_ROOT; -+ -+/* -+ * All tree_nodes are in a list to be freed at once when unstable tree is -+ * freed after each scan round. -+ */ -+static struct list_head unstable_tree_node_list = -+ LIST_HEAD_INIT(unstable_tree_node_list); -+ -+/* List contains all stable nodes */ -+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); -+ -+/* -+ * When the hash strength is changed, the stable tree must be delta_hashed and -+ * re-structured. We use two set of below structs to speed up the -+ * re-structuring of stable tree. -+ */ -+static struct list_head -+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), -+ LIST_HEAD_INIT(stable_tree_node_list[1])}; -+ -+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; -+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; -+static struct rb_root *root_stable_treep = &root_stable_tree[0]; -+static unsigned long stable_tree_index; -+ -+/* The hash strength needed to hash a full page */ -+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) -+ -+/* The hash strength needed for loop-back hashing */ -+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) -+ -+/* The random offsets in a page */ -+static u32 *random_nums; -+ -+/* The hash strength */ -+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; -+ -+/* The delta value each time the hash strength increases or decreases */ -+static unsigned long hash_strength_delta; -+#define HASH_STRENGTH_DELTA_MAX 5 -+ -+/* The time we have saved due to random_sample_hash */ -+static u64 rshash_pos; -+ -+/* The time we have wasted due to hash collision */ -+static u64 rshash_neg; -+ -+struct uksm_benefit { -+ u64 pos; -+ u64 neg; -+ u64 scanned; -+ unsigned long base; -+} benefit; -+ -+/* -+ * The relative cost of memcmp, compared to 1 time unit of random sample -+ * hash, this value is tested when ksm module is initialized -+ */ -+static unsigned long memcmp_cost; -+ -+static unsigned long rshash_neg_cont_zero; -+static unsigned long rshash_cont_obscure; -+ -+/* The possible states of hash strength adjustment heuristic */ -+enum rshash_states { -+ RSHASH_STILL, -+ RSHASH_TRYUP, -+ RSHASH_TRYDOWN, -+ RSHASH_NEW, -+ RSHASH_PRE_STILL, -+}; -+ -+/* The possible direction we are about to adjust hash strength */ -+enum rshash_direct { -+ GO_UP, -+ GO_DOWN, -+ OBSCURE, -+ STILL, -+}; -+ -+/* random sampling hash state machine */ -+static struct { -+ enum rshash_states state; -+ enum rshash_direct pre_direct; -+ u8 below_count; -+ /* Keep a lookup window of size 5, iff above_count/below_count > 3 -+ * in this window we stop trying. -+ */ -+ u8 lookup_window_index; -+ u64 stable_benefit; -+ unsigned long turn_point_down; -+ unsigned long turn_benefit_down; -+ unsigned long turn_point_up; -+ unsigned long turn_benefit_up; -+ unsigned long stable_point; -+} rshash_state; -+ -+/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ -+static u32 *zero_hash_table; -+ -+static inline struct node_vma *alloc_node_vma(void) -+{ -+ struct node_vma *node_vma; -+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); -+ if (node_vma) { -+ INIT_HLIST_HEAD(&node_vma->rmap_hlist); -+ INIT_HLIST_NODE(&node_vma->hlist); -+ } -+ return node_vma; -+} -+ -+static inline void free_node_vma(struct node_vma *node_vma) -+{ -+ kmem_cache_free(node_vma_cache, node_vma); -+} -+ -+ -+static inline struct vma_slot *alloc_vma_slot(void) -+{ -+ struct vma_slot *slot; -+ -+ /* -+ * In case ksm is not initialized by now. -+ * Oops, we need to consider the call site of uksm_init() in the future. -+ */ -+ if (!vma_slot_cache) -+ return NULL; -+ -+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); -+ if (slot) { -+ INIT_LIST_HEAD(&slot->slot_list); -+ INIT_LIST_HEAD(&slot->dedup_list); -+ slot->flags |= UKSM_SLOT_NEED_RERAND; -+ } -+ return slot; -+} -+ -+static inline void free_vma_slot(struct vma_slot *vma_slot) -+{ -+ kmem_cache_free(vma_slot_cache, vma_slot); -+} -+ -+ -+ -+static inline struct rmap_item *alloc_rmap_item(void) -+{ -+ struct rmap_item *rmap_item; -+ -+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); -+ if (rmap_item) { -+ /* bug on lowest bit is not clear for flag use */ -+ BUG_ON(is_addr(rmap_item)); -+ } -+ return rmap_item; -+} -+ -+static inline void free_rmap_item(struct rmap_item *rmap_item) -+{ -+ rmap_item->slot = NULL; /* debug safety */ -+ kmem_cache_free(rmap_item_cache, rmap_item); -+} -+ -+static inline struct stable_node *alloc_stable_node(void) -+{ -+ struct stable_node *node; -+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); -+ if (!node) -+ return NULL; -+ -+ INIT_HLIST_HEAD(&node->hlist); -+ list_add(&node->all_list, &stable_node_list); -+ return node; -+} -+ -+static inline void free_stable_node(struct stable_node *stable_node) -+{ -+ list_del(&stable_node->all_list); -+ kmem_cache_free(stable_node_cache, stable_node); -+} -+ -+static inline struct tree_node *alloc_tree_node(struct list_head *list) -+{ -+ struct tree_node *node; -+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); -+ if (!node) -+ return NULL; -+ -+ list_add(&node->all_list, list); -+ return node; -+} -+ -+static inline void free_tree_node(struct tree_node *node) -+{ -+ list_del(&node->all_list); -+ kmem_cache_free(tree_node_cache, node); -+} -+ -+static void uksm_drop_anon_vma(struct rmap_item *rmap_item) -+{ -+ struct anon_vma *anon_vma = rmap_item->anon_vma; -+ -+ put_anon_vma(anon_vma); -+} -+ -+ -+/** -+ * Remove a stable node from stable_tree, may unlink from its tree_node and -+ * may remove its parent tree_node if no other stable node is pending. -+ * -+ * @stable_node The node need to be removed -+ * @unlink_rb Will this node be unlinked from the rbtree? -+ * @remove_tree_ node Will its tree_node be removed if empty? -+ */ -+static void remove_node_from_stable_tree(struct stable_node *stable_node, -+ int unlink_rb, int remove_tree_node) -+{ -+ struct node_vma *node_vma; -+ struct rmap_item *rmap_item; -+ struct hlist_node *n; -+ -+ if (!hlist_empty(&stable_node->hlist)) { -+ hlist_for_each_entry_safe(node_vma, n, -+ &stable_node->hlist, hlist) { -+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { -+ uksm_pages_sharing--; -+ -+ uksm_drop_anon_vma(rmap_item); -+ rmap_item->address &= PAGE_MASK; -+ } -+ free_node_vma(node_vma); -+ cond_resched(); -+ } -+ -+ /* the last one is counted as shared */ -+ uksm_pages_shared--; -+ uksm_pages_sharing++; -+ } -+ -+ if (stable_node->tree_node && unlink_rb) { -+ rb_erase(&stable_node->node, -+ &stable_node->tree_node->sub_root); -+ -+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && -+ remove_tree_node) { -+ rb_erase(&stable_node->tree_node->node, -+ root_stable_treep); -+ free_tree_node(stable_node->tree_node); -+ } else { -+ stable_node->tree_node->count--; -+ } -+ } -+ -+ free_stable_node(stable_node); -+} -+ -+ -+/* -+ * get_uksm_page: checks if the page indicated by the stable node -+ * is still its ksm page, despite having held no reference to it. -+ * In which case we can trust the content of the page, and it -+ * returns the gotten page; but if the page has now been zapped, -+ * remove the stale node from the stable tree and return NULL. -+ * -+ * You would expect the stable_node to hold a reference to the ksm page. -+ * But if it increments the page's count, swapping out has to wait for -+ * ksmd to come around again before it can free the page, which may take -+ * seconds or even minutes: much too unresponsive. So instead we use a -+ * "keyhole reference": access to the ksm page from the stable node peeps -+ * out through its keyhole to see if that page still holds the right key, -+ * pointing back to this stable node. This relies on freeing a PageAnon -+ * page to reset its page->mapping to NULL, and relies on no other use of -+ * a page to put something that might look like our key in page->mapping. -+ * -+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, -+ * but this is different - made simpler by uksm_thread_mutex being held, but -+ * interesting for assuming that no other use of the struct page could ever -+ * put our expected_mapping into page->mapping (or a field of the union which -+ * coincides with page->mapping). The RCU calls are not for KSM at all, but -+ * to keep the page_count protocol described with page_cache_get_speculative. -+ * -+ * Note: it is possible that get_uksm_page() will return NULL one moment, -+ * then page the next, if the page is in between page_freeze_refs() and -+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page -+ * is on its way to being freed; but it is an anomaly to bear in mind. -+ * -+ * @unlink_rb: if the removal of this node will firstly unlink from -+ * its rbtree. stable_node_reinsert will prevent this when restructuring the -+ * node from its old tree. -+ * -+ * @remove_tree_node: if this is the last one of its tree_node, will the -+ * tree_node be freed ? If we are inserting stable node, this tree_node may -+ * be reused, so don't free it. -+ */ -+static struct page *get_uksm_page(struct stable_node *stable_node, -+ int unlink_rb, int remove_tree_node) -+{ -+ struct page *page; -+ void *expected_mapping; -+ -+ page = pfn_to_page(stable_node->kpfn); -+ expected_mapping = (void *)stable_node + -+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); -+ rcu_read_lock(); -+ if (page->mapping != expected_mapping) -+ goto stale; -+ if (!get_page_unless_zero(page)) -+ goto stale; -+ if (page->mapping != expected_mapping) { -+ put_page(page); -+ goto stale; -+ } -+ rcu_read_unlock(); -+ return page; -+stale: -+ rcu_read_unlock(); -+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); -+ -+ return NULL; -+} -+ -+/* -+ * Removing rmap_item from stable or unstable tree. -+ * This function will clean the information from the stable/unstable tree. -+ */ -+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) -+{ -+ if (rmap_item->address & STABLE_FLAG) { -+ struct stable_node *stable_node; -+ struct node_vma *node_vma; -+ struct page *page; -+ -+ node_vma = rmap_item->head; -+ stable_node = node_vma->head; -+ page = get_uksm_page(stable_node, 1, 1); -+ if (!page) -+ goto out; -+ -+ /* -+ * page lock is needed because it's racing with -+ * try_to_unmap_ksm(), etc. -+ */ -+ lock_page(page); -+ hlist_del(&rmap_item->hlist); -+ -+ if (hlist_empty(&node_vma->rmap_hlist)) { -+ hlist_del(&node_vma->hlist); -+ free_node_vma(node_vma); -+ } -+ unlock_page(page); -+ -+ put_page(page); -+ if (hlist_empty(&stable_node->hlist)) { -+ /* do NOT call remove_node_from_stable_tree() here, -+ * it's possible for a forked rmap_item not in -+ * stable tree while the in-tree rmap_items were -+ * deleted. -+ */ -+ uksm_pages_shared--; -+ } else -+ uksm_pages_sharing--; -+ -+ -+ uksm_drop_anon_vma(rmap_item); -+ } else if (rmap_item->address & UNSTABLE_FLAG) { -+ if (rmap_item->hash_round == uksm_hash_round) { -+ -+ rb_erase(&rmap_item->node, -+ &rmap_item->tree_node->sub_root); -+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { -+ rb_erase(&rmap_item->tree_node->node, -+ &root_unstable_tree); -+ -+ free_tree_node(rmap_item->tree_node); -+ } else -+ rmap_item->tree_node->count--; -+ } -+ uksm_pages_unshared--; -+ } -+ -+ rmap_item->address &= PAGE_MASK; -+ rmap_item->hash_max = 0; -+ -+out: -+ cond_resched(); /* we're called from many long loops */ -+} -+ -+static inline int slot_in_uksm(struct vma_slot *slot) -+{ -+ return list_empty(&slot->slot_list); -+} -+ -+/* -+ * Test if the mm is exiting -+ */ -+static inline bool uksm_test_exit(struct mm_struct *mm) -+{ -+ return atomic_read(&mm->mm_users) == 0; -+} -+ -+/** -+ * Need to do two things: -+ * 1. check if slot was moved to del list -+ * 2. make sure the mmap_sem is manipulated under valid vma. -+ * -+ * My concern here is that in some cases, this may make -+ * vma_slot_list_lock() waiters to serialized further by some -+ * sem->wait_lock, can this really be expensive? -+ * -+ * -+ * @return -+ * 0: if successfully locked mmap_sem -+ * -ENOENT: this slot was moved to del list -+ * -EBUSY: vma lock failed -+ */ -+static int try_down_read_slot_mmap_sem(struct vma_slot *slot) -+{ -+ struct vm_area_struct *vma; -+ struct mm_struct *mm; -+ struct rw_semaphore *sem; -+ -+ spin_lock(&vma_slot_list_lock); -+ -+ /* the slot_list was removed and inited from new list, when it enters -+ * uksm_list. If now it's not empty, then it must be moved to del list -+ */ -+ if (!slot_in_uksm(slot)) { -+ spin_unlock(&vma_slot_list_lock); -+ return -ENOENT; -+ } -+ -+ BUG_ON(slot->pages != vma_pages(slot->vma)); -+ /* Ok, vma still valid */ -+ vma = slot->vma; -+ mm = vma->vm_mm; -+ sem = &mm->mmap_sem; -+ -+ if (uksm_test_exit(mm)) { -+ spin_unlock(&vma_slot_list_lock); -+ return -ENOENT; -+ } -+ -+ if (down_read_trylock(sem)) { -+ spin_unlock(&vma_slot_list_lock); -+ return 0; -+ } -+ -+ spin_unlock(&vma_slot_list_lock); -+ return -EBUSY; -+} -+ -+static inline unsigned long -+vma_page_address(struct page *page, struct vm_area_struct *vma) -+{ -+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); -+ unsigned long address; -+ -+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); -+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { -+ /* page should be within @vma mapping range */ -+ return -EFAULT; -+ } -+ return address; -+} -+ -+ -+/* return 0 on success with the item's mmap_sem locked */ -+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) -+{ -+ struct mm_struct *mm; -+ struct vma_slot *slot = item->slot; -+ int err = -EINVAL; -+ -+ struct page *page; -+ -+ /* -+ * try_down_read_slot_mmap_sem() returns non-zero if the slot -+ * has been removed by uksm_remove_vma(). -+ */ -+ if (try_down_read_slot_mmap_sem(slot)) -+ return -EBUSY; -+ -+ mm = slot->vma->vm_mm; -+ -+ if (uksm_test_exit(mm)) -+ goto failout_up; -+ -+ page = item->page; -+ rcu_read_lock(); -+ if (!get_page_unless_zero(page)) { -+ rcu_read_unlock(); -+ goto failout_up; -+ } -+ -+ /* No need to consider huge page here. */ -+ if (item->slot->vma->anon_vma != page_anon_vma(page) || -+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { -+ /* -+ * TODO: -+ * should we release this item becase of its stale page -+ * mapping? -+ */ -+ put_page(page); -+ rcu_read_unlock(); -+ goto failout_up; -+ } -+ rcu_read_unlock(); -+ return 0; -+ -+failout_up: -+ up_read(&mm->mmap_sem); -+ return err; -+} -+ -+/* -+ * What kind of VMA is considered ? -+ */ -+static inline int vma_can_enter(struct vm_area_struct *vma) -+{ -+ return uksm_flags_can_scan(vma->vm_flags); -+} -+ -+/* -+ * Called whenever a fresh new vma is created A new vma_slot. -+ * is created and inserted into a global list Must be called. -+ * after vma is inserted to its mm . -+ */ -+void uksm_vma_add_new(struct vm_area_struct *vma) -+{ -+ struct vma_slot *slot; -+ -+ if (!vma_can_enter(vma)) { -+ vma->uksm_vma_slot = NULL; -+ return; -+ } -+ -+ slot = alloc_vma_slot(); -+ if (!slot) { -+ vma->uksm_vma_slot = NULL; -+ return; -+ } -+ -+ vma->uksm_vma_slot = slot; -+ vma->vm_flags |= VM_MERGEABLE; -+ slot->vma = vma; -+ slot->mm = vma->vm_mm; -+ slot->ctime_j = jiffies; -+ slot->pages = vma_pages(vma); -+ spin_lock(&vma_slot_list_lock); -+ list_add_tail(&slot->slot_list, &vma_slot_new); -+ spin_unlock(&vma_slot_list_lock); -+} -+ -+/* -+ * Called after vma is unlinked from its mm -+ */ -+void uksm_remove_vma(struct vm_area_struct *vma) -+{ -+ struct vma_slot *slot; -+ -+ if (!vma->uksm_vma_slot) -+ return; -+ -+ slot = vma->uksm_vma_slot; -+ spin_lock(&vma_slot_list_lock); -+ if (slot_in_uksm(slot)) { -+ /** -+ * This slot has been added by ksmd, so move to the del list -+ * waiting ksmd to free it. -+ */ -+ list_add_tail(&slot->slot_list, &vma_slot_del); -+ } else { -+ /** -+ * It's still on new list. It's ok to free slot directly. -+ */ -+ list_del(&slot->slot_list); -+ free_vma_slot(slot); -+ } -+ spin_unlock(&vma_slot_list_lock); -+ vma->uksm_vma_slot = NULL; -+} -+ -+/* 32/3 < they < 32/2 */ -+#define shiftl 8 -+#define shiftr 12 -+ -+#define HASH_FROM_TO(from, to) \ -+for (index = from; index < to; index++) { \ -+ pos = random_nums[index]; \ -+ hash += key[pos]; \ -+ hash += (hash << shiftl); \ -+ hash ^= (hash >> shiftr); \ -+} -+ -+ -+#define HASH_FROM_DOWN_TO(from, to) \ -+for (index = from - 1; index >= to; index--) { \ -+ hash ^= (hash >> shiftr); \ -+ hash ^= (hash >> (shiftr*2)); \ -+ hash -= (hash << shiftl); \ -+ hash += (hash << (shiftl*2)); \ -+ pos = random_nums[index]; \ -+ hash -= key[pos]; \ -+} -+ -+/* -+ * The main random sample hash function. -+ */ -+static u32 random_sample_hash(void *addr, u32 hash_strength) -+{ -+ u32 hash = 0xdeadbeef; -+ int index, pos, loop = hash_strength; -+ u32 *key = (u32 *)addr; -+ -+ if (loop > HASH_STRENGTH_FULL) -+ loop = HASH_STRENGTH_FULL; -+ -+ HASH_FROM_TO(0, loop); -+ -+ if (hash_strength > HASH_STRENGTH_FULL) { -+ loop = hash_strength - HASH_STRENGTH_FULL; -+ HASH_FROM_TO(0, loop); -+ } -+ -+ return hash; -+} -+ -+ -+/** -+ * It's used when hash strength is adjusted -+ * -+ * @addr The page's virtual address -+ * @from The original hash strength -+ * @to The hash strength changed to -+ * @hash The hash value generated with "from" hash value -+ * -+ * return the hash value -+ */ -+static u32 delta_hash(void *addr, int from, int to, u32 hash) -+{ -+ u32 *key = (u32 *)addr; -+ int index, pos; /* make sure they are int type */ -+ -+ if (to > from) { -+ if (from >= HASH_STRENGTH_FULL) { -+ from -= HASH_STRENGTH_FULL; -+ to -= HASH_STRENGTH_FULL; -+ HASH_FROM_TO(from, to); -+ } else if (to <= HASH_STRENGTH_FULL) { -+ HASH_FROM_TO(from, to); -+ } else { -+ HASH_FROM_TO(from, HASH_STRENGTH_FULL); -+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); -+ } -+ } else { -+ if (from <= HASH_STRENGTH_FULL) { -+ HASH_FROM_DOWN_TO(from, to); -+ } else if (to >= HASH_STRENGTH_FULL) { -+ from -= HASH_STRENGTH_FULL; -+ to -= HASH_STRENGTH_FULL; -+ HASH_FROM_DOWN_TO(from, to); -+ } else { -+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); -+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); -+ } -+ } -+ -+ return hash; -+} -+ -+ -+ -+ -+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) -+ -+/** -+ * -+ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round -+ * has finished. -+ * -+ * return 0 if no page has been scanned since last call, 1 otherwise. -+ */ -+static inline int encode_benefit(void) -+{ -+ u64 scanned_delta, pos_delta, neg_delta; -+ unsigned long base = benefit.base; -+ -+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; -+ -+ if (!scanned_delta) -+ return 0; -+ -+ scanned_delta >>= base; -+ pos_delta = rshash_pos >> base; -+ neg_delta = rshash_neg >> base; -+ -+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || -+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || -+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { -+ benefit.scanned >>= 1; -+ benefit.neg >>= 1; -+ benefit.pos >>= 1; -+ benefit.base++; -+ scanned_delta >>= 1; -+ pos_delta >>= 1; -+ neg_delta >>= 1; -+ } -+ -+ benefit.pos += pos_delta; -+ benefit.neg += neg_delta; -+ benefit.scanned += scanned_delta; -+ -+ BUG_ON(!benefit.scanned); -+ -+ rshash_pos = rshash_neg = 0; -+ uksm_pages_scanned_last = uksm_pages_scanned; -+ -+ return 1; -+} -+ -+static inline void reset_benefit(void) -+{ -+ benefit.pos = 0; -+ benefit.neg = 0; -+ benefit.base = 0; -+ benefit.scanned = 0; -+} -+ -+static inline void inc_rshash_pos(unsigned long delta) -+{ -+ if (CAN_OVERFLOW_U64(rshash_pos, delta)) -+ encode_benefit(); -+ -+ rshash_pos += delta; -+} -+ -+static inline void inc_rshash_neg(unsigned long delta) -+{ -+ if (CAN_OVERFLOW_U64(rshash_neg, delta)) -+ encode_benefit(); -+ -+ rshash_neg += delta; -+} -+ -+ -+static inline u32 page_hash(struct page *page, unsigned long hash_strength, -+ int cost_accounting) -+{ -+ u32 val; -+ unsigned long delta; -+ -+ void *addr = kmap_atomic(page); -+ -+ val = random_sample_hash(addr, hash_strength); -+ kunmap_atomic(addr); -+ -+ if (cost_accounting) { -+ if (HASH_STRENGTH_FULL > hash_strength) -+ delta = HASH_STRENGTH_FULL - hash_strength; -+ else -+ delta = 0; -+ -+ inc_rshash_pos(delta); -+ } -+ -+ return val; -+} -+ -+static int memcmp_pages(struct page *page1, struct page *page2, -+ int cost_accounting) -+{ -+ char *addr1, *addr2; -+ int ret; -+ -+ addr1 = kmap_atomic(page1); -+ addr2 = kmap_atomic(page2); -+ ret = memcmp(addr1, addr2, PAGE_SIZE); -+ kunmap_atomic(addr2); -+ kunmap_atomic(addr1); -+ -+ if (cost_accounting) -+ inc_rshash_neg(memcmp_cost); -+ -+ return ret; -+} -+ -+static inline int pages_identical(struct page *page1, struct page *page2) -+{ -+ return !memcmp_pages(page1, page2, 0); -+} -+ -+static inline int is_page_full_zero(struct page *page) -+{ -+ char *addr; -+ int ret; -+ -+ addr = kmap_atomic(page); -+ ret = is_full_zero(addr, PAGE_SIZE); -+ kunmap_atomic(addr); -+ -+ return ret; -+} -+ -+static int write_protect_page(struct vm_area_struct *vma, struct page *page, -+ pte_t *orig_pte, pte_t *old_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ unsigned long addr; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ int swapped; -+ int err = -EFAULT; -+ unsigned long mmun_start; /* For mmu_notifiers */ -+ unsigned long mmun_end; /* For mmu_notifiers */ -+ -+ addr = page_address_in_vma(page, vma); -+ if (addr == -EFAULT) -+ goto out; -+ -+ BUG_ON(PageTransCompound(page)); -+ -+ mmun_start = addr; -+ mmun_end = addr + PAGE_SIZE; -+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); -+ -+ ptep = page_check_address(page, mm, addr, &ptl, 0); -+ if (!ptep) -+ goto out_mn; -+ -+ if (old_pte) -+ *old_pte = *ptep; -+ -+ if (pte_write(*ptep) || pte_dirty(*ptep)) { -+ pte_t entry; -+ -+ swapped = PageSwapCache(page); -+ flush_cache_page(vma, addr, page_to_pfn(page)); -+ /* -+ * Ok this is tricky, when get_user_pages_fast() run it doesnt -+ * take any lock, therefore the check that we are going to make -+ * with the pagecount against the mapcount is racey and -+ * O_DIRECT can happen right after the check. -+ * So we clear the pte and flush the tlb before the check -+ * this assure us that no O_DIRECT can happen after the check -+ * or in the middle of the check. -+ */ -+ entry = ptep_clear_flush(vma, addr, ptep); -+ /* -+ * Check that no O_DIRECT or similar I/O is in progress on the -+ * page -+ */ -+ if (page_mapcount(page) + 1 + swapped != page_count(page)) { -+ set_pte_at(mm, addr, ptep, entry); -+ goto out_unlock; -+ } -+ if (pte_dirty(entry)) -+ set_page_dirty(page); -+ entry = pte_mkclean(pte_wrprotect(entry)); -+ set_pte_at_notify(mm, addr, ptep, entry); -+ } -+ *orig_pte = *ptep; -+ err = 0; -+ -+out_unlock: -+ pte_unmap_unlock(ptep, ptl); -+out_mn: -+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); -+out: -+ return err; -+} -+ -+#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ -+#define MERGE_ERR_COLLI 2 /* there is a collision */ -+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ -+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ -+ -+ -+/** -+ * replace_page - replace page in vma by new ksm page -+ * @vma: vma that holds the pte pointing to page -+ * @page: the page we are replacing by kpage -+ * @kpage: the ksm page we replace page by -+ * @orig_pte: the original value of the pte -+ * -+ * Returns 0 on success, MERGE_ERR_PGERR on failure. -+ */ -+static int replace_page(struct vm_area_struct *vma, struct page *page, -+ struct page *kpage, pte_t orig_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ pgd_t *pgd; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ pte_t entry; -+ -+ unsigned long addr; -+ int err = MERGE_ERR_PGERR; -+ unsigned long mmun_start; /* For mmu_notifiers */ -+ unsigned long mmun_end; /* For mmu_notifiers */ -+ -+ addr = page_address_in_vma(page, vma); -+ if (addr == -EFAULT) -+ goto out; -+ -+ pgd = pgd_offset(mm, addr); -+ if (!pgd_present(*pgd)) -+ goto out; -+ -+ pud = pud_offset(pgd, addr); -+ if (!pud_present(*pud)) -+ goto out; -+ -+ pmd = pmd_offset(pud, addr); -+ BUG_ON(pmd_trans_huge(*pmd)); -+ if (!pmd_present(*pmd)) -+ goto out; -+ -+ mmun_start = addr; -+ mmun_end = addr + PAGE_SIZE; -+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!pte_same(*ptep, orig_pte)) { -+ pte_unmap_unlock(ptep, ptl); -+ goto out_mn; -+ } -+ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush(vma, addr, ptep); -+ entry = mk_pte(kpage, vma->vm_page_prot); -+ -+ /* special treatment is needed for zero_page */ -+ if ((page_to_pfn(kpage) == uksm_zero_pfn) || -+ (page_to_pfn(kpage) == zero_pfn)) -+ entry = pte_mkspecial(entry); -+ else { -+ get_page(kpage); -+ page_add_anon_rmap(kpage, vma, addr); -+ } -+ -+ set_pte_at_notify(mm, addr, ptep, entry); -+ -+ page_remove_rmap(page); -+ if (!page_mapped(page)) -+ try_to_free_swap(page); -+ put_page(page); -+ -+ pte_unmap_unlock(ptep, ptl); -+ err = 0; -+out_mn: -+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); -+out: -+ return err; -+} -+ -+ -+/** -+ * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The -+ * zero hash value at HASH_STRENGTH_MAX is used to indicated that its -+ * hash_max member has not been calculated. -+ * -+ * @page The page needs to be hashed -+ * @hash_old The hash value calculated with current hash strength -+ * -+ * return the new hash value calculated at HASH_STRENGTH_MAX -+ */ -+static inline u32 page_hash_max(struct page *page, u32 hash_old) -+{ -+ u32 hash_max = 0; -+ void *addr; -+ -+ addr = kmap_atomic(page); -+ hash_max = delta_hash(addr, hash_strength, -+ HASH_STRENGTH_MAX, hash_old); -+ -+ kunmap_atomic(addr); -+ -+ if (!hash_max) -+ hash_max = 1; -+ -+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); -+ return hash_max; -+} -+ -+/* -+ * We compare the hash again, to ensure that it is really a hash collision -+ * instead of being caused by page write. -+ */ -+static inline int check_collision(struct rmap_item *rmap_item, -+ u32 hash) -+{ -+ int err; -+ struct page *page = rmap_item->page; -+ -+ /* if this rmap_item has already been hash_maxed, then the collision -+ * must appears in the second-level rbtree search. In this case we check -+ * if its hash_max value has been changed. Otherwise, the collision -+ * happens in the first-level rbtree search, so we check against it's -+ * current hash value. -+ */ -+ if (rmap_item->hash_max) { -+ inc_rshash_neg(memcmp_cost); -+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); -+ -+ if (rmap_item->hash_max == page_hash_max(page, hash)) -+ err = MERGE_ERR_COLLI; -+ else -+ err = MERGE_ERR_CHANGED; -+ } else { -+ inc_rshash_neg(memcmp_cost + hash_strength); -+ -+ if (page_hash(page, hash_strength, 0) == hash) -+ err = MERGE_ERR_COLLI; -+ else -+ err = MERGE_ERR_CHANGED; -+ } -+ -+ return err; -+} -+ -+static struct page *page_trans_compound_anon(struct page *page) -+{ -+ if (PageTransCompound(page)) { -+ struct page *head = compound_trans_head(page); -+ /* -+ * head may actually be splitted and freed from under -+ * us but it's ok here. -+ */ -+ if (PageAnon(head)) -+ return head; -+ } -+ return NULL; -+} -+ -+static int page_trans_compound_anon_split(struct page *page) -+{ -+ int ret = 0; -+ struct page *transhuge_head = page_trans_compound_anon(page); -+ if (transhuge_head) { -+ /* Get the reference on the head to split it. */ -+ if (get_page_unless_zero(transhuge_head)) { -+ /* -+ * Recheck we got the reference while the head -+ * was still anonymous. -+ */ -+ if (PageAnon(transhuge_head)) -+ ret = split_huge_page(transhuge_head); -+ else -+ /* -+ * Retry later if split_huge_page run -+ * from under us. -+ */ -+ ret = 1; -+ put_page(transhuge_head); -+ } else -+ /* Retry later if split_huge_page run from under us. */ -+ ret = 1; -+ } -+ return ret; -+} -+ -+/** -+ * Try to merge a rmap_item.page with a kpage in stable node. kpage must -+ * already be a ksm page. -+ * -+ * @return 0 if the pages were merged, -EFAULT otherwise. -+ */ -+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, -+ struct page *kpage, u32 hash) -+{ -+ struct vm_area_struct *vma = rmap_item->slot->vma; -+ struct mm_struct *mm = vma->vm_mm; -+ pte_t orig_pte = __pte(0); -+ int err = MERGE_ERR_PGERR; -+ struct page *page; -+ -+ if (uksm_test_exit(mm)) -+ goto out; -+ -+ page = rmap_item->page; -+ -+ if (page == kpage) { /* ksm page forked */ -+ err = 0; -+ goto out; -+ } -+ -+ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) -+ goto out; -+ BUG_ON(PageTransCompound(page)); -+ -+ if (!PageAnon(page) || !PageKsm(kpage)) -+ goto out; -+ -+ /* -+ * We need the page lock to read a stable PageSwapCache in -+ * write_protect_page(). We use trylock_page() instead of -+ * lock_page() because we don't want to wait here - we -+ * prefer to continue scanning and merging different pages, -+ * then come back to this page when it is unlocked. -+ */ -+ if (!trylock_page(page)) -+ goto out; -+ /* -+ * If this anonymous page is mapped only here, its pte may need -+ * to be write-protected. If it's mapped elsewhere, all of its -+ * ptes are necessarily already write-protected. But in either -+ * case, we need to lock and check page_count is not raised. -+ */ -+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { -+ if (pages_identical(page, kpage)) -+ err = replace_page(vma, page, kpage, orig_pte); -+ else -+ err = check_collision(rmap_item, hash); -+ } -+ -+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { -+ munlock_vma_page(page); -+ if (!PageMlocked(kpage)) { -+ unlock_page(page); -+ lock_page(kpage); -+ mlock_vma_page(kpage); -+ page = kpage; /* for final unlock */ -+ } -+ } -+ -+ unlock_page(page); -+out: -+ return err; -+} -+ -+ -+ -+/** -+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance -+ * to restore a page mapping that has been changed in try_to_merge_two_pages. -+ * -+ * @return 0 on success. -+ */ -+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, -+ pte_t orig_pte, pte_t wprt_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ pgd_t *pgd; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ -+ int err = -EFAULT; -+ -+ pgd = pgd_offset(mm, addr); -+ if (!pgd_present(*pgd)) -+ goto out; -+ -+ pud = pud_offset(pgd, addr); -+ if (!pud_present(*pud)) -+ goto out; -+ -+ pmd = pmd_offset(pud, addr); -+ if (!pmd_present(*pmd)) -+ goto out; -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!pte_same(*ptep, wprt_pte)) { -+ /* already copied, let it be */ -+ pte_unmap_unlock(ptep, ptl); -+ goto out; -+ } -+ -+ /* -+ * Good boy, still here. When we still get the ksm page, it does not -+ * return to the free page pool, there is no way that a pte was changed -+ * to other page and gets back to this page. And remind that ksm page -+ * do not reuse in do_wp_page(). So it's safe to restore the original -+ * pte. -+ */ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush(vma, addr, ptep); -+ set_pte_at_notify(mm, addr, ptep, orig_pte); -+ -+ pte_unmap_unlock(ptep, ptl); -+ err = 0; -+out: -+ return err; -+} -+ -+/** -+ * try_to_merge_two_pages() - take two identical pages and prepare -+ * them to be merged into one page(rmap_item->page) -+ * -+ * @return 0 if we successfully merged two identical pages into -+ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision -+ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been -+ * changed since it's hashed. MERGE_ERR_PGERR otherwise. -+ * -+ */ -+static int try_to_merge_two_pages(struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ u32 hash) -+{ -+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); -+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); -+ struct vm_area_struct *vma1 = rmap_item->slot->vma; -+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; -+ struct page *page = rmap_item->page; -+ struct page *tree_page = tree_rmap_item->page; -+ int err = MERGE_ERR_PGERR; -+ struct address_space *saved_mapping; -+ -+ -+ if (rmap_item->page == tree_rmap_item->page) -+ goto out; -+ -+ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) -+ goto out; -+ BUG_ON(PageTransCompound(page)); -+ -+ if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page)) -+ goto out; -+ BUG_ON(PageTransCompound(tree_page)); -+ -+ if (!PageAnon(page) || !PageAnon(tree_page)) -+ goto out; -+ -+ if (!trylock_page(page)) -+ goto out; -+ -+ -+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { -+ unlock_page(page); -+ goto out; -+ } -+ -+ /* -+ * While we hold page lock, upgrade page from -+ * PageAnon+anon_vma to PageKsm+NULL stable_node: -+ * stable_tree_insert() will update stable_node. -+ */ -+ saved_mapping = page->mapping; -+ set_page_stable_node(page, NULL); -+ mark_page_accessed(page); -+ unlock_page(page); -+ -+ if (!trylock_page(tree_page)) -+ goto restore_out; -+ -+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if (pages_identical(page, tree_page)) { -+ err = replace_page(vma2, tree_page, page, wprt_pte2); -+ if (err) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if ((vma2->vm_flags & VM_LOCKED)) { -+ munlock_vma_page(tree_page); -+ if (!PageMlocked(page)) { -+ unlock_page(tree_page); -+ lock_page(page); -+ mlock_vma_page(page); -+ tree_page = page; /* for final unlock */ -+ } -+ } -+ -+ unlock_page(tree_page); -+ -+ goto out; /* success */ -+ -+ } else { -+ if (tree_rmap_item->hash_max && -+ tree_rmap_item->hash_max == rmap_item->hash_max) { -+ err = MERGE_ERR_COLLI_MAX; -+ } else if (page_hash(page, hash_strength, 0) == -+ page_hash(tree_page, hash_strength, 0)) { -+ inc_rshash_neg(memcmp_cost + hash_strength * 2); -+ err = MERGE_ERR_COLLI; -+ } else { -+ err = MERGE_ERR_CHANGED; -+ } -+ -+ unlock_page(tree_page); -+ } -+ -+restore_out: -+ lock_page(page); -+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), -+ orig_pte1, wprt_pte1)) -+ page->mapping = saved_mapping; -+ -+ unlock_page(page); -+out: -+ return err; -+} -+ -+static inline int hash_cmp(u32 new_val, u32 node_val) -+{ -+ if (new_val > node_val) -+ return 1; -+ else if (new_val < node_val) -+ return -1; -+ else -+ return 0; -+} -+ -+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) -+{ -+ u32 hash_max = item->hash_max; -+ -+ if (!hash_max) { -+ hash_max = page_hash_max(item->page, hash); -+ -+ item->hash_max = hash_max; -+ } -+ -+ return hash_max; -+} -+ -+ -+ -+/** -+ * stable_tree_search() - search the stable tree for a page -+ * -+ * @item: the rmap_item we are comparing with -+ * @hash: the hash value of this item->page already calculated -+ * -+ * @return the page we have found, NULL otherwise. The page returned has -+ * been gotten. -+ */ -+static struct page *stable_tree_search(struct rmap_item *item, u32 hash) -+{ -+ struct rb_node *node = root_stable_treep->rb_node; -+ struct tree_node *tree_node; -+ unsigned long hash_max; -+ struct page *page = item->page; -+ struct stable_node *stable_node; -+ -+ stable_node = page_stable_node(page); -+ if (stable_node) { -+ /* ksm page forked, that is -+ * if (PageKsm(page) && !in_stable_tree(rmap_item)) -+ * it's actually gotten once outside. -+ */ -+ get_page(page); -+ return page; -+ } -+ -+ while (node) { -+ int cmp; -+ -+ tree_node = rb_entry(node, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) -+ node = node->rb_left; -+ else if (cmp > 0) -+ node = node->rb_right; -+ else -+ break; -+ } -+ -+ if (!node) -+ return NULL; -+ -+ if (tree_node->count == 1) { -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ BUG_ON(!stable_node); -+ -+ goto get_page_out; -+ } -+ -+ /* -+ * ok, we have to search the second -+ * level subtree, hash the page to a -+ * full strength. -+ */ -+ node = tree_node->sub_root.rb_node; -+ BUG_ON(!node); -+ hash_max = rmap_item_hash_max(item, hash); -+ -+ while (node) { -+ int cmp; -+ -+ stable_node = rb_entry(node, struct stable_node, node); -+ -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ if (cmp < 0) -+ node = node->rb_left; -+ else if (cmp > 0) -+ node = node->rb_right; -+ else -+ goto get_page_out; -+ } -+ -+ return NULL; -+ -+get_page_out: -+ page = get_uksm_page(stable_node, 1, 1); -+ return page; -+} -+ -+static int try_merge_rmap_item(struct rmap_item *item, -+ struct page *kpage, -+ struct page *tree_page) -+{ -+ spinlock_t *ptl; -+ pte_t *ptep; -+ unsigned long addr; -+ struct vm_area_struct *vma = item->slot->vma; -+ -+ addr = get_rmap_addr(item); -+ ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); -+ if (!ptep) -+ return 0; -+ -+ if (pte_write(*ptep)) { -+ /* has changed, abort! */ -+ pte_unmap_unlock(ptep, ptl); -+ return 0; -+ } -+ -+ get_page(tree_page); -+ page_add_anon_rmap(tree_page, vma, addr); -+ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush(vma, addr, ptep); -+ set_pte_at_notify(vma->vm_mm, addr, ptep, -+ mk_pte(tree_page, vma->vm_page_prot)); -+ -+ page_remove_rmap(kpage); -+ put_page(kpage); -+ -+ pte_unmap_unlock(ptep, ptl); -+ -+ return 1; -+} -+ -+/** -+ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted -+ * into stable tree, the page was found to be identical to a stable ksm page, -+ * this is the last chance we can merge them into one. -+ * -+ * @item1: the rmap_item holding the page which we wanted to insert -+ * into stable tree. -+ * @item2: the other rmap_item we found when unstable tree search -+ * @oldpage: the page currently mapped by the two rmap_items -+ * @tree_page: the page we found identical in stable tree node -+ * @success1: return if item1 is successfully merged -+ * @success2: return if item2 is successfully merged -+ */ -+static void try_merge_with_stable(struct rmap_item *item1, -+ struct rmap_item *item2, -+ struct page **kpage, -+ struct page *tree_page, -+ int *success1, int *success2) -+{ -+ struct vm_area_struct *vma1 = item1->slot->vma; -+ struct vm_area_struct *vma2 = item2->slot->vma; -+ *success1 = 0; -+ *success2 = 0; -+ -+ if (unlikely(*kpage == tree_page)) { -+ /* I don't think this can really happen */ -+ printk(KERN_WARNING "UKSM: unexpected condition detected in " -+ "try_merge_with_stable() -- *kpage == tree_page !\n"); -+ *success1 = 1; -+ *success2 = 1; -+ return; -+ } -+ -+ if (!PageAnon(*kpage) || !PageKsm(*kpage)) -+ goto failed; -+ -+ if (!trylock_page(tree_page)) -+ goto failed; -+ -+ /* If the oldpage is still ksm and still pointed -+ * to in the right place, and still write protected, -+ * we are confident it's not changed, no need to -+ * memcmp anymore. -+ * be ware, we cannot take nested pte locks, -+ * deadlock risk. -+ */ -+ if (!try_merge_rmap_item(item1, *kpage, tree_page)) -+ goto unlock_failed; -+ -+ /* ok, then vma2, remind that pte1 already set */ -+ if (!try_merge_rmap_item(item2, *kpage, tree_page)) -+ goto success_1; -+ -+ *success2 = 1; -+success_1: -+ *success1 = 1; -+ -+ -+ if ((*success1 && vma1->vm_flags & VM_LOCKED) || -+ (*success2 && vma2->vm_flags & VM_LOCKED)) { -+ munlock_vma_page(*kpage); -+ if (!PageMlocked(tree_page)) -+ mlock_vma_page(tree_page); -+ } -+ -+ /* -+ * We do not need oldpage any more in the caller, so can break the lock -+ * now. -+ */ -+ unlock_page(*kpage); -+ *kpage = tree_page; /* Get unlocked outside. */ -+ return; -+ -+unlock_failed: -+ unlock_page(tree_page); -+failed: -+ return; -+} -+ -+static inline void stable_node_hash_max(struct stable_node *node, -+ struct page *page, u32 hash) -+{ -+ u32 hash_max = node->hash_max; -+ -+ if (!hash_max) { -+ hash_max = page_hash_max(page, hash); -+ node->hash_max = hash_max; -+ } -+} -+ -+static inline -+struct stable_node *new_stable_node(struct tree_node *tree_node, -+ struct page *kpage, u32 hash_max) -+{ -+ struct stable_node *new_stable_node; -+ -+ new_stable_node = alloc_stable_node(); -+ if (!new_stable_node) -+ return NULL; -+ -+ new_stable_node->kpfn = page_to_pfn(kpage); -+ new_stable_node->hash_max = hash_max; -+ new_stable_node->tree_node = tree_node; -+ set_page_stable_node(kpage, new_stable_node); -+ -+ return new_stable_node; -+} -+ -+static inline -+struct stable_node *first_level_insert(struct tree_node *tree_node, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ struct page **kpage, u32 hash, -+ int *success1, int *success2) -+{ -+ int cmp; -+ struct page *tree_page; -+ u32 hash_max = 0; -+ struct stable_node *stable_node, *new_snode; -+ struct rb_node *parent = NULL, **new; -+ -+ /* this tree node contains no sub-tree yet */ -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ cmp = memcmp_pages(*kpage, tree_page, 1); -+ if (!cmp) { -+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, -+ tree_page, success1, success2); -+ put_page(tree_page); -+ if (!*success1 && !*success2) -+ goto failed; -+ -+ return stable_node; -+ -+ } else { -+ /* -+ * collision in first level try to create a subtree. -+ * A new node need to be created. -+ */ -+ put_page(tree_page); -+ -+ stable_node_hash_max(stable_node, tree_page, -+ tree_node->hash); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ parent = &stable_node->node; -+ if (cmp < 0) { -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ new = &parent->rb_right; -+ } else { -+ goto failed; -+ } -+ } -+ -+ } else { -+ /* the only stable_node deleted, we reuse its tree_node. -+ */ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ -+ new_snode = new_stable_node(tree_node, *kpage, hash_max); -+ if (!new_snode) -+ goto failed; -+ -+ rb_link_node(&new_snode->node, parent, new); -+ rb_insert_color(&new_snode->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ -+ return new_snode; -+ -+failed: -+ return NULL; -+} -+ -+static inline -+struct stable_node *stable_subtree_insert(struct tree_node *tree_node, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ struct page **kpage, u32 hash, -+ int *success1, int *success2) -+{ -+ struct page *tree_page; -+ u32 hash_max; -+ struct stable_node *stable_node, *new_snode; -+ struct rb_node *parent, **new; -+ -+research: -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ BUG_ON(!*new); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ while (*new) { -+ int cmp; -+ -+ stable_node = rb_entry(*new, struct stable_node, node); -+ -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else { -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ cmp = memcmp_pages(*kpage, tree_page, 1); -+ if (!cmp) { -+ try_merge_with_stable(rmap_item, -+ tree_rmap_item, kpage, -+ tree_page, success1, success2); -+ -+ put_page(tree_page); -+ if (!*success1 && !*success2) -+ goto failed; -+ /* -+ * successfully merged with a stable -+ * node -+ */ -+ return stable_node; -+ } else { -+ put_page(tree_page); -+ goto failed; -+ } -+ } else { -+ /* -+ * stable node may be deleted, -+ * and subtree maybe -+ * restructed, cannot -+ * continue, research it. -+ */ -+ if (tree_node->count) { -+ goto research; -+ } else { -+ /* reuse the tree node*/ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ } -+ } -+ } -+ -+ new_snode = new_stable_node(tree_node, *kpage, hash_max); -+ if (!new_snode) -+ goto failed; -+ -+ rb_link_node(&new_snode->node, parent, new); -+ rb_insert_color(&new_snode->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ -+ return new_snode; -+ -+failed: -+ return NULL; -+} -+ -+ -+/** -+ * stable_tree_insert() - try to insert a merged page in unstable tree to -+ * the stable tree -+ * -+ * @kpage: the page need to be inserted -+ * @hash: the current hash of this page -+ * @rmap_item: the rmap_item being scanned -+ * @tree_rmap_item: the rmap_item found on unstable tree -+ * @success1: return if rmap_item is merged -+ * @success2: return if tree_rmap_item is merged -+ * -+ * @return the stable_node on stable tree if at least one -+ * rmap_item is inserted into stable tree, NULL -+ * otherwise. -+ */ -+static struct stable_node * -+stable_tree_insert(struct page **kpage, u32 hash, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ int *success1, int *success2) -+{ -+ struct rb_node **new = &root_stable_treep->rb_node; -+ struct rb_node *parent = NULL; -+ struct stable_node *stable_node; -+ struct tree_node *tree_node; -+ u32 hash_max = 0; -+ -+ *success1 = *success2 = 0; -+ -+ while (*new) { -+ int cmp; -+ -+ tree_node = rb_entry(*new, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else -+ break; -+ } -+ -+ if (*new) { -+ if (tree_node->count == 1) { -+ stable_node = first_level_insert(tree_node, rmap_item, -+ tree_rmap_item, kpage, -+ hash, success1, success2); -+ } else { -+ stable_node = stable_subtree_insert(tree_node, -+ rmap_item, tree_rmap_item, kpage, -+ hash, success1, success2); -+ } -+ } else { -+ -+ /* no tree node found */ -+ tree_node = alloc_tree_node(stable_tree_node_listp); -+ if (!tree_node) { -+ stable_node = NULL; -+ goto out; -+ } -+ -+ stable_node = new_stable_node(tree_node, *kpage, hash_max); -+ if (!stable_node) { -+ free_tree_node(tree_node); -+ goto out; -+ } -+ -+ tree_node->hash = hash; -+ rb_link_node(&tree_node->node, parent, new); -+ rb_insert_color(&tree_node->node, root_stable_treep); -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ -+ rb_link_node(&stable_node->node, parent, new); -+ rb_insert_color(&stable_node->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ } -+ -+out: -+ return stable_node; -+} -+ -+ -+/** -+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem -+ * -+ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, -+ * -EINVAL if the page mapping has been changed. -+ */ -+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) -+{ -+ int err; -+ -+ err = get_mergeable_page_lock_mmap(tree_rmap_item); -+ -+ if (err == -EINVAL) { -+ /* its page map has been changed, remove it */ -+ remove_rmap_item_from_tree(tree_rmap_item); -+ } -+ -+ /* The page is gotten and mmap_sem is locked now. */ -+ return err; -+} -+ -+ -+/** -+ * unstable_tree_search_insert() - search an unstable tree rmap_item with the -+ * same hash value. Get its page and trylock the mmap_sem -+ */ -+static inline -+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, -+ u32 hash) -+ -+{ -+ struct rb_node **new = &root_unstable_tree.rb_node; -+ struct rb_node *parent = NULL; -+ struct tree_node *tree_node; -+ u32 hash_max; -+ struct rmap_item *tree_rmap_item; -+ -+ while (*new) { -+ int cmp; -+ -+ tree_node = rb_entry(*new, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else -+ break; -+ } -+ -+ if (*new) { -+ /* got the tree_node */ -+ if (tree_node->count == 1) { -+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, -+ struct rmap_item, node); -+ BUG_ON(!tree_rmap_item); -+ -+ goto get_page_out; -+ } -+ -+ /* well, search the collision subtree */ -+ new = &tree_node->sub_root.rb_node; -+ BUG_ON(!*new); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ -+ while (*new) { -+ int cmp; -+ -+ tree_rmap_item = rb_entry(*new, struct rmap_item, -+ node); -+ -+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); -+ parent = *new; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto get_page_out; -+ } -+ } else { -+ /* alloc a new tree_node */ -+ tree_node = alloc_tree_node(&unstable_tree_node_list); -+ if (!tree_node) -+ return NULL; -+ -+ tree_node->hash = hash; -+ rb_link_node(&tree_node->node, parent, new); -+ rb_insert_color(&tree_node->node, &root_unstable_tree); -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ -+ /* did not found even in sub-tree */ -+ rmap_item->tree_node = tree_node; -+ rmap_item->address |= UNSTABLE_FLAG; -+ rmap_item->hash_round = uksm_hash_round; -+ rb_link_node(&rmap_item->node, parent, new); -+ rb_insert_color(&rmap_item->node, &tree_node->sub_root); -+ -+ uksm_pages_unshared++; -+ return NULL; -+ -+get_page_out: -+ if (tree_rmap_item->page == rmap_item->page) -+ return NULL; -+ -+ if (get_tree_rmap_item_page(tree_rmap_item)) -+ return NULL; -+ -+ return tree_rmap_item; -+} -+ -+static void hold_anon_vma(struct rmap_item *rmap_item, -+ struct anon_vma *anon_vma) -+{ -+ rmap_item->anon_vma = anon_vma; -+ get_anon_vma(anon_vma); -+} -+ -+ -+/** -+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication -+ * ratio statistics is done in this function. -+ * -+ */ -+static void stable_tree_append(struct rmap_item *rmap_item, -+ struct stable_node *stable_node, int logdedup) -+{ -+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; -+ unsigned long key = (unsigned long)rmap_item->slot; -+ unsigned long factor = rmap_item->slot->rung->step; -+ -+ BUG_ON(!stable_node); -+ rmap_item->address |= STABLE_FLAG; -+ -+ if (hlist_empty(&stable_node->hlist)) { -+ uksm_pages_shared++; -+ goto node_vma_new; -+ } else { -+ uksm_pages_sharing++; -+ } -+ -+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { -+ if (node_vma->key >= key) -+ break; -+ -+ if (logdedup) { -+ node_vma->slot->pages_bemerged += factor; -+ if (list_empty(&node_vma->slot->dedup_list)) -+ list_add(&node_vma->slot->dedup_list, -+ &vma_slot_dedup); -+ } -+ } -+ -+ if (node_vma) { -+ if (node_vma->key == key) { -+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); -+ goto node_vma_ok; -+ } else if (node_vma->key > key) { -+ node_vma_cont = node_vma; -+ } -+ } -+ -+node_vma_new: -+ /* no same vma already in node, alloc a new node_vma */ -+ new_node_vma = alloc_node_vma(); -+ BUG_ON(!new_node_vma); -+ new_node_vma->head = stable_node; -+ new_node_vma->slot = rmap_item->slot; -+ -+ if (!node_vma) { -+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); -+ } else if (node_vma->key != key) { -+ if (node_vma->key < key) -+ hlist_add_after(&node_vma->hlist, &new_node_vma->hlist); -+ else { -+ hlist_add_before(&new_node_vma->hlist, -+ &node_vma->hlist); -+ } -+ -+ } -+ node_vma = new_node_vma; -+ -+node_vma_ok: /* ok, ready to add to the list */ -+ rmap_item->head = node_vma; -+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); -+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); -+ if (logdedup) { -+ rmap_item->slot->pages_merged++; -+ if (node_vma_cont) { -+ node_vma = node_vma_cont; -+ hlist_for_each_entry_continue(node_vma, hlist) { -+ node_vma->slot->pages_bemerged += factor; -+ if (list_empty(&node_vma->slot->dedup_list)) -+ list_add(&node_vma->slot->dedup_list, -+ &vma_slot_dedup); -+ } -+ } -+ } -+} -+ -+/* -+ * We use break_ksm to break COW on a ksm page: it's a stripped down -+ * -+ * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) -+ * put_page(page); -+ * -+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, -+ * in case the application has unmapped and remapped mm,addr meanwhile. -+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP -+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. -+ */ -+static int break_ksm(struct vm_area_struct *vma, unsigned long addr) -+{ -+ struct page *page; -+ int ret = 0; -+ -+ do { -+ cond_resched(); -+ page = follow_page(vma, addr, FOLL_GET); -+ if (IS_ERR_OR_NULL(page)) -+ break; -+ if (PageKsm(page)) { -+ ret = handle_mm_fault(vma->vm_mm, vma, addr, -+ FAULT_FLAG_WRITE); -+ } else -+ ret = VM_FAULT_WRITE; -+ put_page(page); -+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); -+ /* -+ * We must loop because handle_mm_fault() may back out if there's -+ * any difficulty e.g. if pte accessed bit gets updated concurrently. -+ * -+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that -+ * COW has been broken, even if the vma does not permit VM_WRITE; -+ * but note that a concurrent fault might break PageKsm for us. -+ * -+ * VM_FAULT_SIGBUS could occur if we race with truncation of the -+ * backing file, which also invalidates anonymous pages: that's -+ * okay, that truncation will have unmapped the PageKsm for us. -+ * -+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting -+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the -+ * current task has TIF_MEMDIE set, and will be OOM killed on return -+ * to user; and ksmd, having no mm, would never be chosen for that. -+ * -+ * But if the mm is in a limited mem_cgroup, then the fault may fail -+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and -+ * even ksmd can fail in this way - though it's usually breaking ksm -+ * just to undo a merge it made a moment before, so unlikely to oom. -+ * -+ * That's a pity: we might therefore have more kernel pages allocated -+ * than we're counting as nodes in the stable tree; but uksm_do_scan -+ * will retry to break_cow on each pass, so should recover the page -+ * in due course. The important thing is to not let VM_MERGEABLE -+ * be cleared while any such pages might remain in the area. -+ */ -+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; -+} -+ -+static void break_cow(struct rmap_item *rmap_item) -+{ -+ struct vm_area_struct *vma = rmap_item->slot->vma; -+ struct mm_struct *mm = vma->vm_mm; -+ unsigned long addr = get_rmap_addr(rmap_item); -+ -+ if (uksm_test_exit(mm)) -+ goto out; -+ -+ break_ksm(vma, addr); -+out: -+ return; -+} -+ -+/* -+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather -+ * than check every pte of a given vma, the locking doesn't quite work for -+ * that - an rmap_item is assigned to the stable tree after inserting ksm -+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing -+ * rmap_items from parent to child at fork time (so as not to waste time -+ * if exit comes before the next scan reaches it). -+ * -+ * Similarly, although we'd like to remove rmap_items (so updating counts -+ * and freeing memory) when unmerging an area, it's easier to leave that -+ * to the next pass of ksmd - consider, for example, how ksmd might be -+ * in cmp_and_merge_page on one of the rmap_items we would be removing. -+ */ -+inline int unmerge_uksm_pages(struct vm_area_struct *vma, -+ unsigned long start, unsigned long end) -+{ -+ unsigned long addr; -+ int err = 0; -+ -+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { -+ if (uksm_test_exit(vma->vm_mm)) -+ break; -+ if (signal_pending(current)) -+ err = -ERESTARTSYS; -+ else -+ err = break_ksm(vma, addr); -+ } -+ return err; -+} -+ -+static inline void inc_uksm_pages_scanned(void) -+{ -+ u64 delta; -+ -+ -+ if (uksm_pages_scanned == U64_MAX) { -+ encode_benefit(); -+ -+ delta = uksm_pages_scanned >> pages_scanned_base; -+ -+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { -+ pages_scanned_stored >>= 1; -+ delta >>= 1; -+ pages_scanned_base++; -+ } -+ -+ pages_scanned_stored += delta; -+ -+ uksm_pages_scanned = uksm_pages_scanned_last = 0; -+ } -+ -+ uksm_pages_scanned++; -+} -+ -+static inline int find_zero_page_hash(int strength, u32 hash) -+{ -+ return (zero_hash_table[strength] == hash); -+} -+ -+static -+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) -+{ -+ struct page *zero_page = empty_uksm_zero_page; -+ struct mm_struct *mm = vma->vm_mm; -+ pte_t orig_pte = __pte(0); -+ int err = -EFAULT; -+ -+ if (uksm_test_exit(mm)) -+ goto out; -+ -+ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) -+ goto out; -+ BUG_ON(PageTransCompound(page)); -+ -+ if (!PageAnon(page)) -+ goto out; -+ -+ if (!trylock_page(page)) -+ goto out; -+ -+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { -+ if (is_page_full_zero(page)) -+ err = replace_page(vma, page, zero_page, orig_pte); -+ } -+ -+ unlock_page(page); -+out: -+ return err; -+} -+ -+/* -+ * cmp_and_merge_page() - first see if page can be merged into the stable -+ * tree; if not, compare hash to previous and if it's the same, see if page -+ * can be inserted into the unstable tree, or merged with a page already there -+ * and both transferred to the stable tree. -+ * -+ * @page: the page that we are searching identical page to. -+ * @rmap_item: the reverse mapping into the virtual address of this page -+ */ -+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) -+{ -+ struct rmap_item *tree_rmap_item; -+ struct page *page; -+ struct page *kpage = NULL; -+ u32 hash_max; -+ int err; -+ unsigned int success1, success2; -+ struct stable_node *snode; -+ int cmp; -+ struct rb_node *parent = NULL, **new; -+ -+ remove_rmap_item_from_tree(rmap_item); -+ page = rmap_item->page; -+ -+ /* We first start with searching the page inside the stable tree */ -+ kpage = stable_tree_search(rmap_item, hash); -+ if (kpage) { -+ err = try_to_merge_with_uksm_page(rmap_item, kpage, -+ hash); -+ if (!err) { -+ /* -+ * The page was successfully merged, add -+ * its rmap_item to the stable tree. -+ * page lock is needed because it's -+ * racing with try_to_unmap_ksm(), etc. -+ */ -+ lock_page(kpage); -+ snode = page_stable_node(kpage); -+ stable_tree_append(rmap_item, snode, 1); -+ unlock_page(kpage); -+ put_page(kpage); -+ return; /* success */ -+ } -+ put_page(kpage); -+ -+ /* -+ * if it's a collision and it has been search in sub-rbtree -+ * (hash_max != 0), we want to abort, because if it is -+ * successfully merged in unstable tree, the collision trends to -+ * happen again. -+ */ -+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) -+ return; -+ } -+ -+ tree_rmap_item = -+ unstable_tree_search_insert(rmap_item, hash); -+ if (tree_rmap_item) { -+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); -+ /* -+ * As soon as we merge this page, we want to remove the -+ * rmap_item of the page we have merged with from the unstable -+ * tree, and insert it instead as new node in the stable tree. -+ */ -+ if (!err) { -+ kpage = page; -+ remove_rmap_item_from_tree(tree_rmap_item); -+ lock_page(kpage); -+ snode = stable_tree_insert(&kpage, hash, -+ rmap_item, tree_rmap_item, -+ &success1, &success2); -+ -+ /* -+ * Do not log dedup for tree item, it's not counted as -+ * scanned in this round. -+ */ -+ if (success2) -+ stable_tree_append(tree_rmap_item, snode, 0); -+ -+ /* -+ * The order of these two stable append is important: -+ * we are scanning rmap_item. -+ */ -+ if (success1) -+ stable_tree_append(rmap_item, snode, 1); -+ -+ /* -+ * The original kpage may be unlocked inside -+ * stable_tree_insert() already. This page -+ * should be unlocked before doing -+ * break_cow(). -+ */ -+ unlock_page(kpage); -+ -+ if (!success1) -+ break_cow(rmap_item); -+ -+ if (!success2) -+ break_cow(tree_rmap_item); -+ -+ } else if (err == MERGE_ERR_COLLI) { -+ BUG_ON(tree_rmap_item->tree_node->count > 1); -+ -+ rmap_item_hash_max(tree_rmap_item, -+ tree_rmap_item->tree_node->hash); -+ -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); -+ parent = &tree_rmap_item->node; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto put_up_out; -+ -+ rmap_item->tree_node = tree_rmap_item->tree_node; -+ rmap_item->address |= UNSTABLE_FLAG; -+ rmap_item->hash_round = uksm_hash_round; -+ rb_link_node(&rmap_item->node, parent, new); -+ rb_insert_color(&rmap_item->node, -+ &tree_rmap_item->tree_node->sub_root); -+ rmap_item->tree_node->count++; -+ } else { -+ /* -+ * either one of the page has changed or they collide -+ * at the max hash, we consider them as ill items. -+ */ -+ remove_rmap_item_from_tree(tree_rmap_item); -+ } -+put_up_out: -+ put_page(tree_rmap_item->page); -+ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); -+ } -+} -+ -+ -+ -+ -+static inline unsigned long get_pool_index(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; -+ if (pool_index >= slot->pool_size) -+ BUG(); -+ return pool_index; -+} -+ -+static inline unsigned long index_page_offset(unsigned long index) -+{ -+ return offset_in_page(sizeof(struct rmap_list_entry *) * index); -+} -+ -+static inline -+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, -+ unsigned long index, int need_alloc) -+{ -+ unsigned long pool_index; -+ struct page *page; -+ void *addr; -+ -+ -+ pool_index = get_pool_index(slot, index); -+ if (!slot->rmap_list_pool[pool_index]) { -+ if (!need_alloc) -+ return NULL; -+ -+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); -+ if (!page) -+ return NULL; -+ -+ slot->rmap_list_pool[pool_index] = page; -+ } -+ -+ addr = kmap(slot->rmap_list_pool[pool_index]); -+ addr += index_page_offset(index); -+ -+ return addr; -+} -+ -+static inline void put_rmap_list_entry(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ kunmap(slot->rmap_list_pool[pool_index]); -+} -+ -+static inline int entry_is_new(struct rmap_list_entry *entry) -+{ -+ return !entry->item; -+} -+ -+static inline unsigned long get_index_orig_addr(struct vma_slot *slot, -+ unsigned long index) -+{ -+ return slot->vma->vm_start + (index << PAGE_SHIFT); -+} -+ -+static inline unsigned long get_entry_address(struct rmap_list_entry *entry) -+{ -+ unsigned long addr; -+ -+ if (is_addr(entry->addr)) -+ addr = get_clean_addr(entry->addr); -+ else if (entry->item) -+ addr = get_rmap_addr(entry->item); -+ else -+ BUG(); -+ -+ return addr; -+} -+ -+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) -+{ -+ if (is_addr(entry->addr)) -+ return NULL; -+ -+ return entry->item; -+} -+ -+static inline void inc_rmap_list_pool_count(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ slot->pool_counts[pool_index]++; -+} -+ -+static inline void dec_rmap_list_pool_count(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ BUG_ON(!slot->pool_counts[pool_index]); -+ slot->pool_counts[pool_index]--; -+} -+ -+static inline int entry_has_rmap(struct rmap_list_entry *entry) -+{ -+ return !is_addr(entry->addr) && entry->item; -+} -+ -+static inline void swap_entries(struct rmap_list_entry *entry1, -+ unsigned long index1, -+ struct rmap_list_entry *entry2, -+ unsigned long index2) -+{ -+ struct rmap_list_entry tmp; -+ -+ /* swapping two new entries is meaningless */ -+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); -+ -+ tmp = *entry1; -+ *entry1 = *entry2; -+ *entry2 = tmp; -+ -+ if (entry_has_rmap(entry1)) -+ entry1->item->entry_index = index1; -+ -+ if (entry_has_rmap(entry2)) -+ entry2->item->entry_index = index2; -+ -+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { -+ inc_rmap_list_pool_count(entry1->item->slot, index1); -+ dec_rmap_list_pool_count(entry1->item->slot, index2); -+ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { -+ inc_rmap_list_pool_count(entry2->item->slot, index2); -+ dec_rmap_list_pool_count(entry2->item->slot, index1); -+ } -+} -+ -+static inline void free_entry_item(struct rmap_list_entry *entry) -+{ -+ unsigned long index; -+ struct rmap_item *item; -+ -+ if (!is_addr(entry->addr)) { -+ BUG_ON(!entry->item); -+ item = entry->item; -+ entry->addr = get_rmap_addr(item); -+ set_is_addr(entry->addr); -+ index = item->entry_index; -+ remove_rmap_item_from_tree(item); -+ dec_rmap_list_pool_count(item->slot, index); -+ free_rmap_item(item); -+ } -+} -+ -+static inline int pool_entry_boundary(unsigned long index) -+{ -+ unsigned long linear_addr; -+ -+ linear_addr = sizeof(struct rmap_list_entry *) * index; -+ return index && !offset_in_page(linear_addr); -+} -+ -+static inline void try_free_last_pool(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ if (slot->rmap_list_pool[pool_index] && -+ !slot->pool_counts[pool_index]) { -+ __free_page(slot->rmap_list_pool[pool_index]); -+ slot->rmap_list_pool[pool_index] = NULL; -+ slot->flags |= UKSM_SLOT_NEED_SORT; -+ } -+ -+} -+ -+static inline unsigned long vma_item_index(struct vm_area_struct *vma, -+ struct rmap_item *item) -+{ -+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; -+} -+ -+static int within_same_pool(struct vma_slot *slot, -+ unsigned long i, unsigned long j) -+{ -+ unsigned long pool_i, pool_j; -+ -+ pool_i = get_pool_index(slot, i); -+ pool_j = get_pool_index(slot, j); -+ -+ return (pool_i == pool_j); -+} -+ -+static void sort_rmap_entry_list(struct vma_slot *slot) -+{ -+ unsigned long i, j; -+ struct rmap_list_entry *entry, *swap_entry; -+ -+ entry = get_rmap_list_entry(slot, 0, 0); -+ for (i = 0; i < slot->pages; ) { -+ -+ if (!entry) -+ goto skip_whole_pool; -+ -+ if (entry_is_new(entry)) -+ goto next_entry; -+ -+ if (is_addr(entry->addr)) { -+ entry->addr = 0; -+ goto next_entry; -+ } -+ -+ j = vma_item_index(slot->vma, entry->item); -+ if (j == i) -+ goto next_entry; -+ -+ if (within_same_pool(slot, i, j)) -+ swap_entry = entry + j - i; -+ else -+ swap_entry = get_rmap_list_entry(slot, j, 1); -+ -+ swap_entries(entry, i, swap_entry, j); -+ if (!within_same_pool(slot, i, j)) -+ put_rmap_list_entry(slot, j); -+ continue; -+ -+skip_whole_pool: -+ i += PAGE_SIZE / sizeof(*entry); -+ if (i < slot->pages) -+ entry = get_rmap_list_entry(slot, i, 0); -+ continue; -+ -+next_entry: -+ if (i >= slot->pages - 1 || -+ !within_same_pool(slot, i, i + 1)) { -+ put_rmap_list_entry(slot, i); -+ if (i + 1 < slot->pages) -+ entry = get_rmap_list_entry(slot, i + 1, 0); -+ } else -+ entry++; -+ i++; -+ continue; -+ } -+ -+ /* free empty pool entries which contain no rmap_item */ -+ /* CAN be simplied to based on only pool_counts when bug freed !!!!! */ -+ for (i = 0; i < slot->pool_size; i++) { -+ unsigned char has_rmap; -+ void *addr; -+ -+ if (!slot->rmap_list_pool[i]) -+ continue; -+ -+ has_rmap = 0; -+ addr = kmap(slot->rmap_list_pool[i]); -+ BUG_ON(!addr); -+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { -+ entry = (struct rmap_list_entry *)addr + j; -+ if (is_addr(entry->addr)) -+ continue; -+ if (!entry->item) -+ continue; -+ has_rmap = 1; -+ } -+ kunmap(slot->rmap_list_pool[i]); -+ if (!has_rmap) { -+ BUG_ON(slot->pool_counts[i]); -+ __free_page(slot->rmap_list_pool[i]); -+ slot->rmap_list_pool[i] = NULL; -+ } -+ } -+ -+ slot->flags &= ~UKSM_SLOT_NEED_SORT; -+} -+ -+/* -+ * vma_fully_scanned() - if all the pages in this slot have been scanned. -+ */ -+static inline int vma_fully_scanned(struct vma_slot *slot) -+{ -+ return slot->pages_scanned == slot->pages; -+} -+ -+/** -+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to -+ * its random permutation. This function is embedded with the random -+ * permutation index management code. -+ */ -+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) -+{ -+ unsigned long rand_range, addr, swap_index, scan_index; -+ struct rmap_item *item = NULL; -+ struct rmap_list_entry *scan_entry, *swap_entry = NULL; -+ struct page *page; -+ -+ scan_index = swap_index = slot->pages_scanned % slot->pages; -+ -+ if (pool_entry_boundary(scan_index)) -+ try_free_last_pool(slot, scan_index - 1); -+ -+ if (vma_fully_scanned(slot)) { -+ if (slot->flags & UKSM_SLOT_NEED_SORT) -+ slot->flags |= UKSM_SLOT_NEED_RERAND; -+ else -+ slot->flags &= ~UKSM_SLOT_NEED_RERAND; -+ if (slot->flags & UKSM_SLOT_NEED_SORT) -+ sort_rmap_entry_list(slot); -+ } -+ -+ scan_entry = get_rmap_list_entry(slot, scan_index, 1); -+ if (!scan_entry) -+ return NULL; -+ -+ if (entry_is_new(scan_entry)) { -+ scan_entry->addr = get_index_orig_addr(slot, scan_index); -+ set_is_addr(scan_entry->addr); -+ } -+ -+ if (slot->flags & UKSM_SLOT_NEED_RERAND) { -+ rand_range = slot->pages - scan_index; -+ BUG_ON(!rand_range); -+ swap_index = scan_index + (prandom_u32() % rand_range); -+ } -+ -+ if (swap_index != scan_index) { -+ swap_entry = get_rmap_list_entry(slot, swap_index, 1); -+ if (entry_is_new(swap_entry)) { -+ swap_entry->addr = get_index_orig_addr(slot, -+ swap_index); -+ set_is_addr(swap_entry->addr); -+ } -+ swap_entries(scan_entry, scan_index, swap_entry, swap_index); -+ } -+ -+ addr = get_entry_address(scan_entry); -+ item = get_entry_item(scan_entry); -+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); -+ -+ page = follow_page(slot->vma, addr, FOLL_GET); -+ if (IS_ERR_OR_NULL(page)) -+ goto nopage; -+ -+ if (!PageAnon(page) && !page_trans_compound_anon(page)) -+ goto putpage; -+ -+ /*check is zero_page pfn or uksm_zero_page*/ -+ if ((page_to_pfn(page) == zero_pfn) -+ || (page_to_pfn(page) == uksm_zero_pfn)) -+ goto putpage; -+ -+ flush_anon_page(slot->vma, page, addr); -+ flush_dcache_page(page); -+ -+ -+ *hash = page_hash(page, hash_strength, 1); -+ inc_uksm_pages_scanned(); -+ /*if the page content all zero, re-map to zero-page*/ -+ if (find_zero_page_hash(hash_strength, *hash)) { -+ if (!cmp_and_merge_zero_page(slot->vma, page)) { -+ slot->pages_merged++; -+ __inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); -+ dec_mm_counter(slot->mm, MM_ANONPAGES); -+ -+ /* For full-zero pages, no need to create rmap item */ -+ goto putpage; -+ } else { -+ inc_rshash_neg(memcmp_cost / 2); -+ } -+ } -+ -+ if (!item) { -+ item = alloc_rmap_item(); -+ if (item) { -+ /* It has already been zeroed */ -+ item->slot = slot; -+ item->address = addr; -+ item->entry_index = scan_index; -+ scan_entry->item = item; -+ inc_rmap_list_pool_count(slot, scan_index); -+ } else -+ goto putpage; -+ } -+ -+ BUG_ON(item->slot != slot); -+ /* the page may have changed */ -+ item->page = page; -+ put_rmap_list_entry(slot, scan_index); -+ if (swap_entry) -+ put_rmap_list_entry(slot, swap_index); -+ return item; -+ -+putpage: -+ put_page(page); -+ page = NULL; -+nopage: -+ /* no page, store addr back and free rmap_item if possible */ -+ free_entry_item(scan_entry); -+ put_rmap_list_entry(slot, scan_index); -+ if (swap_entry) -+ put_rmap_list_entry(slot, swap_index); -+ return NULL; -+} -+ -+static inline int in_stable_tree(struct rmap_item *rmap_item) -+{ -+ return rmap_item->address & STABLE_FLAG; -+} -+ -+/** -+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with -+ * mmap_sem locked. -+ */ -+static noinline void scan_vma_one_page(struct vma_slot *slot) -+{ -+ u32 hash; -+ struct mm_struct *mm; -+ struct rmap_item *rmap_item = NULL; -+ struct vm_area_struct *vma = slot->vma; -+ -+ mm = vma->vm_mm; -+ BUG_ON(!mm); -+ BUG_ON(!slot); -+ -+ rmap_item = get_next_rmap_item(slot, &hash); -+ if (!rmap_item) -+ goto out1; -+ -+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) -+ goto out2; -+ -+ cmp_and_merge_page(rmap_item, hash); -+out2: -+ put_page(rmap_item->page); -+out1: -+ slot->pages_scanned++; -+ if (slot->fully_scanned_round != fully_scanned_round) -+ scanned_virtual_pages++; -+ -+ if (vma_fully_scanned(slot)) -+ slot->fully_scanned_round = fully_scanned_round; -+} -+ -+static inline unsigned long rung_get_pages(struct scan_rung *rung) -+{ -+ struct slot_tree_node *node; -+ -+ if (!rung->vma_root.rnode) -+ return 0; -+ -+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); -+ -+ return node->size; -+} -+ -+#define RUNG_SAMPLED_MIN 3 -+ -+static inline -+void uksm_calc_rung_step(struct scan_rung *rung, -+ unsigned long page_time, unsigned long ratio) -+{ -+ unsigned long sampled, pages; -+ -+ /* will be fully scanned ? */ -+ if (!rung->cover_msecs) { -+ rung->step = 1; -+ return; -+ } -+ -+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) -+ * ratio / page_time; -+ -+ /* -+ * Before we finsish a scan round and expensive per-round jobs, -+ * we need to have a chance to estimate the per page time. So -+ * the sampled number can not be too small. -+ */ -+ if (sampled < RUNG_SAMPLED_MIN) -+ sampled = RUNG_SAMPLED_MIN; -+ -+ pages = rung_get_pages(rung); -+ if (likely(pages > sampled)) -+ rung->step = pages / sampled; -+ else -+ rung->step = 1; -+} -+ -+static inline int step_need_recalc(struct scan_rung *rung) -+{ -+ unsigned long pages, stepmax; -+ -+ pages = rung_get_pages(rung); -+ stepmax = pages / RUNG_SAMPLED_MIN; -+ -+ return pages && (rung->step > pages || -+ (stepmax && rung->step > stepmax)); -+} -+ -+static inline -+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) -+{ -+ struct vma_slot *slot; -+ -+ if (finished) -+ rung->flags |= UKSM_RUNG_ROUND_FINISHED; -+ -+ if (step_recalc || step_need_recalc(rung)) { -+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); -+ BUG_ON(step_need_recalc(rung)); -+ } -+ -+ slot_iter_index = prandom_u32() % rung->step; -+ BUG_ON(!rung->vma_root.rnode); -+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); -+ BUG_ON(!slot); -+ -+ rung->current_scan = slot; -+ rung->current_offset = slot_iter_index; -+} -+ -+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) -+{ -+ return &slot->rung->vma_root; -+} -+ -+/* -+ * return if resetted. -+ */ -+static int advance_current_scan(struct scan_rung *rung) -+{ -+ unsigned short n; -+ struct vma_slot *slot, *next = NULL; -+ -+ BUG_ON(!rung->vma_root.num); -+ -+ slot = rung->current_scan; -+ n = (slot->pages - rung->current_offset) % rung->step; -+ slot_iter_index = rung->step - n; -+ next = sradix_tree_next(&rung->vma_root, slot->snode, -+ slot->sindex, slot_iter); -+ -+ if (next) { -+ rung->current_offset = slot_iter_index; -+ rung->current_scan = next; -+ return 0; -+ } else { -+ reset_current_scan(rung, 1, 0); -+ return 1; -+ } -+} -+ -+static inline void rung_rm_slot(struct vma_slot *slot) -+{ -+ struct scan_rung *rung = slot->rung; -+ struct sradix_tree_root *root; -+ -+ if (rung->current_scan == slot) -+ advance_current_scan(rung); -+ -+ root = slot_get_root(slot); -+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); -+ slot->snode = NULL; -+ if (step_need_recalc(rung)) { -+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); -+ BUG_ON(step_need_recalc(rung)); -+ } -+ -+ /* In case advance_current_scan loop back to this slot again */ -+ if (rung->vma_root.num && rung->current_scan == slot) -+ reset_current_scan(slot->rung, 1, 0); -+} -+ -+static inline void rung_add_new_slots(struct scan_rung *rung, -+ struct vma_slot **slots, unsigned long num) -+{ -+ int err; -+ struct vma_slot *slot; -+ unsigned long i; -+ struct sradix_tree_root *root = &rung->vma_root; -+ -+ err = sradix_tree_enter(root, (void **)slots, num); -+ BUG_ON(err); -+ -+ for (i = 0; i < num; i++) { -+ slot = slots[i]; -+ slot->rung = rung; -+ BUG_ON(vma_fully_scanned(slot)); -+ } -+ -+ if (rung->vma_root.num == num) -+ reset_current_scan(rung, 0, 1); -+} -+ -+static inline int rung_add_one_slot(struct scan_rung *rung, -+ struct vma_slot *slot) -+{ -+ int err; -+ -+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); -+ if (err) -+ return err; -+ -+ slot->rung = rung; -+ if (rung->vma_root.num == 1) -+ reset_current_scan(rung, 0, 1); -+ -+ return 0; -+} -+ -+/* -+ * Return true if the slot is deleted from its rung. -+ */ -+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) -+{ -+ struct scan_rung *old_rung = slot->rung; -+ int err; -+ -+ if (old_rung == rung) -+ return 0; -+ -+ rung_rm_slot(slot); -+ err = rung_add_one_slot(rung, slot); -+ if (err) { -+ err = rung_add_one_slot(old_rung, slot); -+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ -+ } -+ -+ return 1; -+} -+ -+static inline int vma_rung_up(struct vma_slot *slot) -+{ -+ struct scan_rung *rung; -+ -+ rung = slot->rung; -+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) -+ rung++; -+ -+ return vma_rung_enter(slot, rung); -+} -+ -+static inline int vma_rung_down(struct vma_slot *slot) -+{ -+ struct scan_rung *rung; -+ -+ rung = slot->rung; -+ if (slot->rung != &uksm_scan_ladder[0]) -+ rung--; -+ -+ return vma_rung_enter(slot, rung); -+} -+ -+/** -+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. -+ */ -+static unsigned long cal_dedup_ratio(struct vma_slot *slot) -+{ -+ unsigned long ret; -+ -+ BUG_ON(slot->pages_scanned == slot->last_scanned); -+ -+ ret = slot->pages_merged; -+ -+ /* Thrashing area filtering */ -+ if (ret && uksm_thrash_threshold) { -+ if (slot->pages_cowed * 100 / slot->pages_merged -+ > uksm_thrash_threshold) { -+ ret = 0; -+ } else { -+ ret = slot->pages_merged - slot->pages_cowed; -+ } -+ } -+ -+ return ret; -+} -+ -+/** -+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. -+ */ -+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) -+{ -+ unsigned long ret; -+ unsigned long pages_scanned; -+ -+ pages_scanned = slot->pages_scanned; -+ if (!pages_scanned) { -+ if (uksm_thrash_threshold) -+ return 0; -+ else -+ pages_scanned = slot->pages_scanned; -+ } -+ -+ ret = slot->pages_bemerged * 100 / pages_scanned; -+ -+ /* Thrashing area filtering */ -+ if (ret && uksm_thrash_threshold) { -+ if (slot->pages_cowed * 100 / slot->pages_bemerged -+ > uksm_thrash_threshold) { -+ ret = 0; -+ } else { -+ ret = slot->pages_bemerged - slot->pages_cowed; -+ } -+ } -+ -+ return ret; -+} -+ -+/** -+ * stable_node_reinsert() - When the hash_strength has been adjusted, the -+ * stable tree need to be restructured, this is the function re-inserting the -+ * stable node. -+ */ -+static inline void stable_node_reinsert(struct stable_node *new_node, -+ struct page *page, -+ struct rb_root *root_treep, -+ struct list_head *tree_node_listp, -+ u32 hash) -+{ -+ struct rb_node **new = &root_treep->rb_node; -+ struct rb_node *parent = NULL; -+ struct stable_node *stable_node; -+ struct tree_node *tree_node; -+ struct page *tree_page; -+ int cmp; -+ -+ while (*new) { -+ int cmp; -+ -+ tree_node = rb_entry(*new, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else -+ break; -+ } -+ -+ if (*new) { -+ /* find a stable tree node with same first level hash value */ -+ stable_node_hash_max(new_node, page, hash); -+ if (tree_node->count == 1) { -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ stable_node_hash_max(stable_node, -+ tree_page, hash); -+ put_page(tree_page); -+ -+ /* prepare for stable node insertion */ -+ -+ cmp = hash_cmp(new_node->hash_max, -+ stable_node->hash_max); -+ parent = &stable_node->node; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto failed; -+ -+ goto add_node; -+ } else { -+ /* the only stable_node deleted, the tree node -+ * was not deleted. -+ */ -+ goto tree_node_reuse; -+ } -+ } -+ -+ /* well, search the collision subtree */ -+ new = &tree_node->sub_root.rb_node; -+ parent = NULL; -+ BUG_ON(!*new); -+ while (*new) { -+ int cmp; -+ -+ stable_node = rb_entry(*new, struct stable_node, node); -+ -+ cmp = hash_cmp(new_node->hash_max, -+ stable_node->hash_max); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else { -+ /* oh, no, still a collision */ -+ goto failed; -+ } -+ } -+ -+ goto add_node; -+ } -+ -+ /* no tree node found */ -+ tree_node = alloc_tree_node(tree_node_listp); -+ if (!tree_node) { -+ printk(KERN_ERR "UKSM: memory allocation error!\n"); -+ goto failed; -+ } else { -+ tree_node->hash = hash; -+ rb_link_node(&tree_node->node, parent, new); -+ rb_insert_color(&tree_node->node, root_treep); -+ -+tree_node_reuse: -+ /* prepare for stable node insertion */ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ -+add_node: -+ rb_link_node(&new_node->node, parent, new); -+ rb_insert_color(&new_node->node, &tree_node->sub_root); -+ new_node->tree_node = tree_node; -+ tree_node->count++; -+ return; -+ -+failed: -+ /* This can only happen when two nodes have collided -+ * in two levels. -+ */ -+ new_node->tree_node = NULL; -+ return; -+} -+ -+static inline void free_all_tree_nodes(struct list_head *list) -+{ -+ struct tree_node *node, *tmp; -+ -+ list_for_each_entry_safe(node, tmp, list, all_list) { -+ free_tree_node(node); -+ } -+} -+ -+/** -+ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash -+ * strength to the current hash_strength. It re-structures the hole tree. -+ */ -+static inline void stable_tree_delta_hash(u32 prev_hash_strength) -+{ -+ struct stable_node *node, *tmp; -+ struct rb_root *root_new_treep; -+ struct list_head *new_tree_node_listp; -+ -+ stable_tree_index = (stable_tree_index + 1) % 2; -+ root_new_treep = &root_stable_tree[stable_tree_index]; -+ new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; -+ *root_new_treep = RB_ROOT; -+ BUG_ON(!list_empty(new_tree_node_listp)); -+ -+ /* -+ * we need to be safe, the node could be removed by get_uksm_page() -+ */ -+ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { -+ void *addr; -+ struct page *node_page; -+ u32 hash; -+ -+ /* -+ * We are completely re-structuring the stable nodes to a new -+ * stable tree. We don't want to touch the old tree unlinks and -+ * old tree_nodes. The old tree_nodes will be freed at once. -+ */ -+ node_page = get_uksm_page(node, 0, 0); -+ if (!node_page) -+ continue; -+ -+ if (node->tree_node) { -+ hash = node->tree_node->hash; -+ -+ addr = kmap_atomic(node_page); -+ -+ hash = delta_hash(addr, prev_hash_strength, -+ hash_strength, hash); -+ kunmap_atomic(addr); -+ } else { -+ /* -+ *it was not inserted to rbtree due to collision in last -+ *round scan. -+ */ -+ hash = page_hash(node_page, hash_strength, 0); -+ } -+ -+ stable_node_reinsert(node, node_page, root_new_treep, -+ new_tree_node_listp, hash); -+ put_page(node_page); -+ } -+ -+ root_stable_treep = root_new_treep; -+ free_all_tree_nodes(stable_tree_node_listp); -+ BUG_ON(!list_empty(stable_tree_node_listp)); -+ stable_tree_node_listp = new_tree_node_listp; -+} -+ -+static inline void inc_hash_strength(unsigned long delta) -+{ -+ hash_strength += 1 << delta; -+ if (hash_strength > HASH_STRENGTH_MAX) -+ hash_strength = HASH_STRENGTH_MAX; -+} -+ -+static inline void dec_hash_strength(unsigned long delta) -+{ -+ unsigned long change = 1 << delta; -+ -+ if (hash_strength <= change + 1) -+ hash_strength = 1; -+ else -+ hash_strength -= change; -+} -+ -+static inline void inc_hash_strength_delta(void) -+{ -+ hash_strength_delta++; -+ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) -+ hash_strength_delta = HASH_STRENGTH_DELTA_MAX; -+} -+ -+/* -+static inline unsigned long get_current_neg_ratio(void) -+{ -+ if (!rshash_pos || rshash_neg > rshash_pos) -+ return 100; -+ -+ return div64_u64(100 * rshash_neg , rshash_pos); -+} -+*/ -+ -+static inline unsigned long get_current_neg_ratio(void) -+{ -+ u64 pos = benefit.pos; -+ u64 neg = benefit.neg; -+ -+ if (!neg) -+ return 0; -+ -+ if (!pos || neg > pos) -+ return 100; -+ -+ if (neg > div64_u64(U64_MAX, 100)) -+ pos = div64_u64(pos, 100); -+ else -+ neg *= 100; -+ -+ return div64_u64(neg, pos); -+} -+ -+static inline unsigned long get_current_benefit(void) -+{ -+ u64 pos = benefit.pos; -+ u64 neg = benefit.neg; -+ u64 scanned = benefit.scanned; -+ -+ if (neg > pos) -+ return 0; -+ -+ return div64_u64((pos - neg), scanned); -+} -+ -+static inline int judge_rshash_direction(void) -+{ -+ u64 current_neg_ratio, stable_benefit; -+ u64 current_benefit, delta = 0; -+ int ret = STILL; -+ -+ /* Try to probe a value after the boot, and in case the system -+ are still for a long time. */ -+ if ((fully_scanned_round & 0xFFULL) == 10) { -+ ret = OBSCURE; -+ goto out; -+ } -+ -+ current_neg_ratio = get_current_neg_ratio(); -+ -+ if (current_neg_ratio == 0) { -+ rshash_neg_cont_zero++; -+ if (rshash_neg_cont_zero > 2) -+ return GO_DOWN; -+ else -+ return STILL; -+ } -+ rshash_neg_cont_zero = 0; -+ -+ if (current_neg_ratio > 90) { -+ ret = GO_UP; -+ goto out; -+ } -+ -+ current_benefit = get_current_benefit(); -+ stable_benefit = rshash_state.stable_benefit; -+ -+ if (!stable_benefit) { -+ ret = OBSCURE; -+ goto out; -+ } -+ -+ if (current_benefit > stable_benefit) -+ delta = current_benefit - stable_benefit; -+ else if (current_benefit < stable_benefit) -+ delta = stable_benefit - current_benefit; -+ -+ delta = div64_u64(100 * delta , stable_benefit); -+ -+ if (delta > 50) { -+ rshash_cont_obscure++; -+ if (rshash_cont_obscure > 2) -+ return OBSCURE; -+ else -+ return STILL; -+ } -+ -+out: -+ rshash_cont_obscure = 0; -+ return ret; -+} -+ -+/** -+ * rshash_adjust() - The main function to control the random sampling state -+ * machine for hash strength adapting. -+ * -+ * return true if hash_strength has changed. -+ */ -+static inline int rshash_adjust(void) -+{ -+ unsigned long prev_hash_strength = hash_strength; -+ -+ if (!encode_benefit()) -+ return 0; -+ -+ switch (rshash_state.state) { -+ case RSHASH_STILL: -+ switch (judge_rshash_direction()) { -+ case GO_UP: -+ if (rshash_state.pre_direct == GO_DOWN) -+ hash_strength_delta = 0; -+ -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.pre_direct = GO_UP; -+ break; -+ -+ case GO_DOWN: -+ if (rshash_state.pre_direct == GO_UP) -+ hash_strength_delta = 0; -+ -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.pre_direct = GO_DOWN; -+ break; -+ -+ case OBSCURE: -+ rshash_state.stable_point = hash_strength; -+ rshash_state.turn_point_down = hash_strength; -+ rshash_state.turn_point_up = hash_strength; -+ rshash_state.turn_benefit_down = get_current_benefit(); -+ rshash_state.turn_benefit_up = get_current_benefit(); -+ rshash_state.lookup_window_index = 0; -+ rshash_state.state = RSHASH_TRYDOWN; -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ break; -+ -+ case STILL: -+ break; -+ default: -+ BUG(); -+ } -+ break; -+ -+ case RSHASH_TRYDOWN: -+ if (rshash_state.lookup_window_index++ % 5 == 0) -+ rshash_state.below_count = 0; -+ -+ if (get_current_benefit() < rshash_state.stable_benefit) -+ rshash_state.below_count++; -+ else if (get_current_benefit() > -+ rshash_state.turn_benefit_down) { -+ rshash_state.turn_point_down = hash_strength; -+ rshash_state.turn_benefit_down = get_current_benefit(); -+ } -+ -+ if (rshash_state.below_count >= 3 || -+ judge_rshash_direction() == GO_UP || -+ hash_strength == 1) { -+ hash_strength = rshash_state.stable_point; -+ hash_strength_delta = 0; -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.lookup_window_index = 0; -+ rshash_state.state = RSHASH_TRYUP; -+ hash_strength_delta = 0; -+ } else { -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ } -+ break; -+ -+ case RSHASH_TRYUP: -+ if (rshash_state.lookup_window_index++ % 5 == 0) -+ rshash_state.below_count = 0; -+ -+ if (get_current_benefit() < rshash_state.turn_benefit_down) -+ rshash_state.below_count++; -+ else if (get_current_benefit() > rshash_state.turn_benefit_up) { -+ rshash_state.turn_point_up = hash_strength; -+ rshash_state.turn_benefit_up = get_current_benefit(); -+ } -+ -+ if (rshash_state.below_count >= 3 || -+ judge_rshash_direction() == GO_DOWN || -+ hash_strength == HASH_STRENGTH_MAX) { -+ hash_strength = rshash_state.turn_benefit_up > -+ rshash_state.turn_benefit_down ? -+ rshash_state.turn_point_up : -+ rshash_state.turn_point_down; -+ -+ rshash_state.state = RSHASH_PRE_STILL; -+ } else { -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ } -+ -+ break; -+ -+ case RSHASH_NEW: -+ case RSHASH_PRE_STILL: -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.state = RSHASH_STILL; -+ hash_strength_delta = 0; -+ break; -+ default: -+ BUG(); -+ } -+ -+ /* rshash_neg = rshash_pos = 0; */ -+ reset_benefit(); -+ -+ if (prev_hash_strength != hash_strength) -+ stable_tree_delta_hash(prev_hash_strength); -+ -+ return prev_hash_strength != hash_strength; -+} -+ -+/** -+ * round_update_ladder() - The main function to do update of all the -+ * adjustments whenever a scan round is finished. -+ */ -+static noinline void round_update_ladder(void) -+{ -+ int i; -+ unsigned long dedup; -+ struct vma_slot *slot, *tmp_slot; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; -+ } -+ -+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { -+ -+ /* slot may be rung_rm_slot() when mm exits */ -+ if (slot->snode) { -+ dedup = cal_dedup_ratio_old(slot); -+ if (dedup && dedup >= uksm_abundant_threshold) -+ vma_rung_up(slot); -+ } -+ -+ slot->pages_bemerged = 0; -+ slot->pages_cowed = 0; -+ -+ list_del_init(&slot->dedup_list); -+ } -+} -+ -+static void uksm_del_vma_slot(struct vma_slot *slot) -+{ -+ int i, j; -+ struct rmap_list_entry *entry; -+ -+ if (slot->snode) { -+ /* -+ * In case it just failed when entering the rung, it's not -+ * necessary. -+ */ -+ rung_rm_slot(slot); -+ } -+ -+ if (!list_empty(&slot->dedup_list)) -+ list_del(&slot->dedup_list); -+ -+ if (!slot->rmap_list_pool || !slot->pool_counts) { -+ /* In case it OOMed in uksm_vma_enter() */ -+ goto out; -+ } -+ -+ for (i = 0; i < slot->pool_size; i++) { -+ void *addr; -+ -+ if (!slot->rmap_list_pool[i]) -+ continue; -+ -+ addr = kmap(slot->rmap_list_pool[i]); -+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { -+ entry = (struct rmap_list_entry *)addr + j; -+ if (is_addr(entry->addr)) -+ continue; -+ if (!entry->item) -+ continue; -+ -+ remove_rmap_item_from_tree(entry->item); -+ free_rmap_item(entry->item); -+ slot->pool_counts[i]--; -+ } -+ BUG_ON(slot->pool_counts[i]); -+ kunmap(slot->rmap_list_pool[i]); -+ __free_page(slot->rmap_list_pool[i]); -+ } -+ kfree(slot->rmap_list_pool); -+ kfree(slot->pool_counts); -+ -+out: -+ slot->rung = NULL; -+ BUG_ON(uksm_pages_total < slot->pages); -+ if (slot->flags & UKSM_SLOT_IN_UKSM) -+ uksm_pages_total -= slot->pages; -+ -+ if (slot->fully_scanned_round == fully_scanned_round) -+ scanned_virtual_pages -= slot->pages; -+ else -+ scanned_virtual_pages -= slot->pages_scanned; -+ free_vma_slot(slot); -+} -+ -+ -+#define SPIN_LOCK_PERIOD 32 -+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; -+static inline void cleanup_vma_slots(void) -+{ -+ struct vma_slot *slot; -+ int i; -+ -+ i = 0; -+ spin_lock(&vma_slot_list_lock); -+ while (!list_empty(&vma_slot_del)) { -+ slot = list_entry(vma_slot_del.next, -+ struct vma_slot, slot_list); -+ list_del(&slot->slot_list); -+ cleanup_slots[i++] = slot; -+ if (i == SPIN_LOCK_PERIOD) { -+ spin_unlock(&vma_slot_list_lock); -+ while (--i >= 0) -+ uksm_del_vma_slot(cleanup_slots[i]); -+ i = 0; -+ spin_lock(&vma_slot_list_lock); -+ } -+ } -+ spin_unlock(&vma_slot_list_lock); -+ -+ while (--i >= 0) -+ uksm_del_vma_slot(cleanup_slots[i]); -+} -+ -+/* -+*expotional moving average formula -+*/ -+static inline unsigned long ema(unsigned long curr, unsigned long last_ema) -+{ -+ /* -+ * For a very high burst, even the ema cannot work well, a false very -+ * high per-page time estimation can result in feedback in very high -+ * overhead of context swith and rung update -- this will then lead -+ * to higher per-paper time, this may not converge. -+ * -+ * Instead, we try to approach this value in a binary manner. -+ */ -+ if (curr > last_ema * 10) -+ return last_ema * 2; -+ -+ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; -+} -+ -+/* -+ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to -+ * nanoseconds based on current uksm_sleep_jiffies. -+ */ -+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) -+{ -+ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / -+ (TIME_RATIO_SCALE - ratio) * ratio; -+} -+ -+ -+static inline unsigned long rung_real_ratio(int cpu_time_ratio) -+{ -+ unsigned long ret; -+ -+ BUG_ON(!cpu_time_ratio); -+ -+ if (cpu_time_ratio > 0) -+ ret = cpu_time_ratio; -+ else -+ ret = (unsigned long)(-cpu_time_ratio) * -+ uksm_max_cpu_percentage / 100UL; -+ -+ return ret ? ret : 1; -+} -+ -+static noinline void uksm_calc_scan_pages(void) -+{ -+ struct scan_rung *ladder = uksm_scan_ladder; -+ unsigned long sleep_usecs, nsecs; -+ unsigned long ratio; -+ int i; -+ unsigned long per_page; -+ -+ if (uksm_ema_page_time > 100000 || -+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) -+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; -+ -+ per_page = uksm_ema_page_time; -+ BUG_ON(!per_page); -+ -+ /* -+ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value -+ * based on saved user input. -+ */ -+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) -+ uksm_sleep_jiffies = uksm_sleep_saved; -+ -+ /* We require a rung scan at least 1 page in a period. */ -+ nsecs = per_page; -+ ratio = rung_real_ratio(ladder[0].cpu_ratio); -+ if (cpu_ratio_to_nsec(ratio) < nsecs) { -+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio -+ / NSEC_PER_USEC; -+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ ratio = rung_real_ratio(ladder[i].cpu_ratio); -+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / -+ per_page; -+ BUG_ON(!ladder[i].pages_to_scan); -+ uksm_calc_rung_step(&ladder[i], per_page, ratio); -+ } -+} -+ -+/* -+ * From the scan time of this round (ns) to next expected min sleep time -+ * (ms), be careful of the possible overflows. ratio is taken from -+ * rung_real_ratio() -+ */ -+static inline -+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) -+{ -+ scan_time >>= 20; /* to msec level now */ -+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); -+ -+ return (unsigned int) ((unsigned long) scan_time * -+ (TIME_RATIO_SCALE - ratio) / ratio); -+} -+ -+#define __round_mask(x, y) ((__typeof__(x))((y)-1)) -+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) -+ -+static inline unsigned long vma_pool_size(struct vma_slot *slot) -+{ -+ return round_up(sizeof(struct rmap_list_entry) * slot->pages, -+ PAGE_SIZE) >> PAGE_SHIFT; -+} -+ -+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) -+{ -+ struct scan_rung *rung; -+ unsigned long pool_size, i; -+ struct vma_slot *slot; -+ int failed; -+ -+ rung = &uksm_scan_ladder[0]; -+ -+ failed = 0; -+ for (i = 0; i < num; i++) { -+ slot = slots[i]; -+ -+ pool_size = vma_pool_size(slot); -+ slot->rmap_list_pool = kzalloc(sizeof(struct page *) * -+ pool_size, GFP_KERNEL); -+ if (!slot->rmap_list_pool) -+ break; -+ -+ slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, -+ GFP_KERNEL); -+ if (!slot->pool_counts) { -+ kfree(slot->rmap_list_pool); -+ break; -+ } -+ -+ slot->pool_size = pool_size; -+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); -+ slot->flags |= UKSM_SLOT_IN_UKSM; -+ uksm_pages_total += slot->pages; -+ } -+ -+ if (i) -+ rung_add_new_slots(rung, slots, i); -+ -+ return; -+} -+ -+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; -+ -+static void uksm_enter_all_slots(void) -+{ -+ struct vma_slot *slot; -+ unsigned long index; -+ struct list_head empty_vma_list; -+ int i; -+ -+ i = 0; -+ index = 0; -+ INIT_LIST_HEAD(&empty_vma_list); -+ -+ spin_lock(&vma_slot_list_lock); -+ while (!list_empty(&vma_slot_new)) { -+ slot = list_entry(vma_slot_new.next, -+ struct vma_slot, slot_list); -+ -+ if (!slot->vma->anon_vma) { -+ list_move(&slot->slot_list, &empty_vma_list); -+ } else if (vma_can_enter(slot->vma)) { -+ batch_slots[index++] = slot; -+ list_del_init(&slot->slot_list); -+ } else { -+ list_move(&slot->slot_list, &vma_slot_noadd); -+ } -+ -+ if (++i == SPIN_LOCK_PERIOD || -+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { -+ spin_unlock(&vma_slot_list_lock); -+ -+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { -+ uksm_vma_enter(batch_slots, index); -+ index = 0; -+ } -+ i = 0; -+ cond_resched(); -+ spin_lock(&vma_slot_list_lock); -+ } -+ } -+ -+ list_splice(&empty_vma_list, &vma_slot_new); -+ -+ spin_unlock(&vma_slot_list_lock); -+ -+ if (index) -+ uksm_vma_enter(batch_slots, index); -+ -+} -+ -+static inline int rung_round_finished(struct scan_rung *rung) -+{ -+ return rung->flags & UKSM_RUNG_ROUND_FINISHED; -+} -+ -+static inline void judge_slot(struct vma_slot *slot) -+{ -+ struct scan_rung *rung = slot->rung; -+ unsigned long dedup; -+ int deleted; -+ -+ dedup = cal_dedup_ratio(slot); -+ if (vma_fully_scanned(slot) && uksm_thrash_threshold) -+ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); -+ else if (dedup && dedup >= uksm_abundant_threshold) -+ deleted = vma_rung_up(slot); -+ else -+ deleted = vma_rung_down(slot); -+ -+ slot->pages_merged = 0; -+ slot->pages_cowed = 0; -+ -+ if (vma_fully_scanned(slot)) -+ slot->pages_scanned = 0; -+ -+ slot->last_scanned = slot->pages_scanned; -+ -+ /* If its deleted in above, then rung was already advanced. */ -+ if (!deleted) -+ advance_current_scan(rung); -+} -+ -+ -+static inline int hash_round_finished(void) -+{ -+ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { -+ scanned_virtual_pages = 0; -+ if (uksm_pages_scanned) -+ fully_scanned_round++; -+ -+ return 1; -+ } else { -+ return 0; -+ } -+} -+ -+#define UKSM_MMSEM_BATCH 5 -+#define BUSY_RETRY 100 -+ -+/** -+ * uksm_do_scan() - the main worker function. -+ */ -+static noinline void uksm_do_scan(void) -+{ -+ struct vma_slot *slot, *iter; -+ struct mm_struct *busy_mm; -+ unsigned char round_finished, all_rungs_emtpy; -+ int i, err, mmsem_batch; -+ unsigned long pcost; -+ long long delta_exec; -+ unsigned long vpages, max_cpu_ratio; -+ unsigned long long start_time, end_time, scan_time; -+ unsigned int expected_jiffies; -+ -+ might_sleep(); -+ -+ vpages = 0; -+ -+ start_time = task_sched_runtime(current); -+ max_cpu_ratio = 0; -+ mmsem_batch = 0; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE;) { -+ struct scan_rung *rung = &uksm_scan_ladder[i]; -+ unsigned long ratio; -+ int busy_retry; -+ -+ if (!rung->pages_to_scan) { -+ i++; -+ continue; -+ } -+ -+ if (!rung->vma_root.num) { -+ rung->pages_to_scan = 0; -+ i++; -+ continue; -+ } -+ -+ ratio = rung_real_ratio(rung->cpu_ratio); -+ if (ratio > max_cpu_ratio) -+ max_cpu_ratio = ratio; -+ -+ busy_retry = BUSY_RETRY; -+ /* -+ * Do not consider rung_round_finished() here, just used up the -+ * rung->pages_to_scan quota. -+ */ -+ while (rung->pages_to_scan && rung->vma_root.num && -+ likely(!freezing(current))) { -+ int reset = 0; -+ -+ slot = rung->current_scan; -+ -+ BUG_ON(vma_fully_scanned(slot)); -+ -+ if (mmsem_batch) { -+ err = 0; -+ } else { -+ err = try_down_read_slot_mmap_sem(slot); -+ } -+ -+ if (err == -ENOENT) { -+rm_slot: -+ rung_rm_slot(slot); -+ continue; -+ } -+ -+ busy_mm = slot->mm; -+ -+ if (err == -EBUSY) { -+ /* skip other vmas on the same mm */ -+ do { -+ reset = advance_current_scan(rung); -+ iter = rung->current_scan; -+ busy_retry--; -+ if (iter->vma->vm_mm != busy_mm || -+ !busy_retry || reset) -+ break; -+ } while (1); -+ -+ if (iter->vma->vm_mm != busy_mm) { -+ continue; -+ } else { -+ /* scan round finsished */ -+ break; -+ } -+ } -+ -+ BUG_ON(!vma_can_enter(slot->vma)); -+ if (uksm_test_exit(slot->vma->vm_mm)) { -+ mmsem_batch = 0; -+ up_read(&slot->vma->vm_mm->mmap_sem); -+ goto rm_slot; -+ } -+ -+ if (mmsem_batch) -+ mmsem_batch--; -+ else -+ mmsem_batch = UKSM_MMSEM_BATCH; -+ -+ /* Ok, we have take the mmap_sem, ready to scan */ -+ scan_vma_one_page(slot); -+ rung->pages_to_scan--; -+ vpages++; -+ -+ if (rung->current_offset + rung->step > slot->pages - 1 -+ || vma_fully_scanned(slot)) { -+ up_read(&slot->vma->vm_mm->mmap_sem); -+ judge_slot(slot); -+ mmsem_batch = 0; -+ } else { -+ rung->current_offset += rung->step; -+ if (!mmsem_batch) -+ up_read(&slot->vma->vm_mm->mmap_sem); -+ } -+ -+ busy_retry = BUSY_RETRY; -+ cond_resched(); -+ } -+ -+ if (mmsem_batch) { -+ up_read(&slot->vma->vm_mm->mmap_sem); -+ mmsem_batch = 0; -+ } -+ -+ if (freezing(current)) -+ break; -+ -+ cond_resched(); -+ } -+ end_time = task_sched_runtime(current); -+ delta_exec = end_time - start_time; -+ -+ if (freezing(current)) -+ return; -+ -+ cleanup_vma_slots(); -+ uksm_enter_all_slots(); -+ -+ round_finished = 1; -+ all_rungs_emtpy = 1; -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ struct scan_rung *rung = &uksm_scan_ladder[i]; -+ -+ if (rung->vma_root.num) { -+ all_rungs_emtpy = 0; -+ if (!rung_round_finished(rung)) -+ round_finished = 0; -+ } -+ } -+ -+ if (all_rungs_emtpy) -+ round_finished = 0; -+ -+ if (round_finished) { -+ round_update_ladder(); -+ uksm_eval_round++; -+ -+ if (hash_round_finished() && rshash_adjust()) { -+ /* Reset the unstable root iff hash strength changed */ -+ uksm_hash_round++; -+ root_unstable_tree = RB_ROOT; -+ free_all_tree_nodes(&unstable_tree_node_list); -+ } -+ -+ /* -+ * A number of pages can hang around indefinitely on per-cpu -+ * pagevecs, raised page count preventing write_protect_page -+ * from merging them. Though it doesn't really matter much, -+ * it is puzzling to see some stuck in pages_volatile until -+ * other activity jostles them out, and they also prevented -+ * LTP's KSM test from succeeding deterministically; so drain -+ * them here (here rather than on entry to uksm_do_scan(), -+ * so we don't IPI too often when pages_to_scan is set low). -+ */ -+ lru_add_drain_all(); -+ } -+ -+ -+ if (vpages && delta_exec > 0) { -+ pcost = (unsigned long) delta_exec / vpages; -+ if (likely(uksm_ema_page_time)) -+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); -+ else -+ uksm_ema_page_time = pcost; -+ } -+ -+ uksm_calc_scan_pages(); -+ uksm_sleep_real = uksm_sleep_jiffies; -+ /* in case of radical cpu bursts, apply the upper bound */ -+ end_time = task_sched_runtime(current); -+ if (max_cpu_ratio && end_time > start_time) { -+ scan_time = end_time - start_time; -+ expected_jiffies = msecs_to_jiffies( -+ scan_time_to_sleep(scan_time, max_cpu_ratio)); -+ -+ if (expected_jiffies > uksm_sleep_real) -+ uksm_sleep_real = expected_jiffies; -+ -+ /* We have a 1 second up bound for responsiveness. */ -+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) -+ uksm_sleep_real = msecs_to_jiffies(1000); -+ } -+ -+ return; -+} -+ -+static int ksmd_should_run(void) -+{ -+ return uksm_run & UKSM_RUN_MERGE; -+} -+ -+static int uksm_scan_thread(void *nothing) -+{ -+ set_freezable(); -+ set_user_nice(current, 5); -+ -+ while (!kthread_should_stop()) { -+ mutex_lock(&uksm_thread_mutex); -+ if (ksmd_should_run()) { -+ uksm_do_scan(); -+ } -+ mutex_unlock(&uksm_thread_mutex); -+ -+ try_to_freeze(); -+ -+ if (ksmd_should_run()) { -+ schedule_timeout_interruptible(uksm_sleep_real); -+ uksm_sleep_times++; -+ } else { -+ wait_event_freezable(uksm_thread_wait, -+ ksmd_should_run() || kthread_should_stop()); -+ } -+ } -+ return 0; -+} -+ -+int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, -+ unsigned long *vm_flags) -+{ -+ struct stable_node *stable_node; -+ struct node_vma *node_vma; -+ struct rmap_item *rmap_item; -+ unsigned int mapcount = page_mapcount(page); -+ int referenced = 0; -+ int search_new_forks = 0; -+ unsigned long address; -+ -+ VM_BUG_ON(!PageKsm(page)); -+ VM_BUG_ON(!PageLocked(page)); -+ -+ stable_node = page_stable_node(page); -+ if (!stable_node) -+ return 0; -+ -+ -+again: -+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { -+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { -+ struct anon_vma *anon_vma = rmap_item->anon_vma; -+ struct anon_vma_chain *vmac; -+ struct vm_area_struct *vma; -+ -+ anon_vma_lock_read(anon_vma); -+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, -+ 0, ULONG_MAX) { -+ -+ vma = vmac->vma; -+ address = get_rmap_addr(rmap_item); -+ -+ if (address < vma->vm_start || -+ address >= vma->vm_end) -+ continue; -+ /* -+ * Initially we examine only the vma which -+ * covers this rmap_item; but later, if there -+ * is still work to do, we examine covering -+ * vmas in other mms: in case they were forked -+ * from the original since ksmd passed. -+ */ -+ if ((rmap_item->slot->vma == vma) == -+ search_new_forks) -+ continue; -+ -+ if (memcg && -+ !mm_match_cgroup(vma->vm_mm, memcg)) -+ continue; -+ -+ referenced += -+ page_referenced_one(page, vma, -+ address, &mapcount, vm_flags); -+ if (!search_new_forks || !mapcount) -+ break; -+ } -+ -+ anon_vma_unlock_read(anon_vma); -+ if (!mapcount) -+ goto out; -+ } -+ } -+ if (!search_new_forks++) -+ goto again; -+out: -+ return referenced; -+} -+ -+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) -+{ -+ struct stable_node *stable_node; -+ struct node_vma *node_vma; -+ struct rmap_item *rmap_item; -+ int ret = SWAP_AGAIN; -+ int search_new_forks = 0; -+ unsigned long address; -+ -+ VM_BUG_ON(!PageKsm(page)); -+ VM_BUG_ON(!PageLocked(page)); -+ -+ stable_node = page_stable_node(page); -+ if (!stable_node) -+ return SWAP_FAIL; -+again: -+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { -+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { -+ struct anon_vma *anon_vma = rmap_item->anon_vma; -+ struct anon_vma_chain *vmac; -+ struct vm_area_struct *vma; -+ -+ anon_vma_lock_read(anon_vma); -+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, -+ 0, ULONG_MAX) { -+ vma = vmac->vma; -+ address = get_rmap_addr(rmap_item); -+ -+ if (address < vma->vm_start || -+ address >= vma->vm_end) -+ continue; -+ /* -+ * Initially we examine only the vma which -+ * covers this rmap_item; but later, if there -+ * is still work to do, we examine covering -+ * vmas in other mms: in case they were forked -+ * from the original since ksmd passed. -+ */ -+ if ((rmap_item->slot->vma == vma) == -+ search_new_forks) -+ continue; -+ -+ ret = try_to_unmap_one(page, vma, -+ address, flags); -+ if (ret != SWAP_AGAIN || !page_mapped(page)) { -+ anon_vma_unlock_read(anon_vma); -+ goto out; -+ } -+ } -+ anon_vma_unlock_read(anon_vma); -+ } -+ } -+ if (!search_new_forks++) -+ goto again; -+out: -+ return ret; -+} -+ -+#ifdef CONFIG_MIGRATION -+int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, -+ struct vm_area_struct *, unsigned long, void *), void *arg) -+{ -+ struct stable_node *stable_node; -+ struct node_vma *node_vma; -+ struct rmap_item *rmap_item; -+ int ret = SWAP_AGAIN; -+ int search_new_forks = 0; -+ unsigned long address; -+ -+ VM_BUG_ON(!PageKsm(page)); -+ VM_BUG_ON(!PageLocked(page)); -+ -+ stable_node = page_stable_node(page); -+ if (!stable_node) -+ return ret; -+again: -+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { -+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { -+ struct anon_vma *anon_vma = rmap_item->anon_vma; -+ struct anon_vma_chain *vmac; -+ struct vm_area_struct *vma; -+ -+ anon_vma_lock_read(anon_vma); -+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, -+ 0, ULONG_MAX) { -+ vma = vmac->vma; -+ address = get_rmap_addr(rmap_item); -+ -+ if (address < vma->vm_start || -+ address >= vma->vm_end) -+ continue; -+ -+ if ((rmap_item->slot->vma == vma) == -+ search_new_forks) -+ continue; -+ -+ ret = rmap_one(page, vma, address, arg); -+ if (ret != SWAP_AGAIN) { -+ anon_vma_unlock_read(anon_vma); -+ goto out; -+ } -+ } -+ anon_vma_unlock_read(anon_vma); -+ } -+ } -+ if (!search_new_forks++) -+ goto again; -+out: -+ return ret; -+} -+ -+/* Common ksm interface but may be specific to uksm */ -+void ksm_migrate_page(struct page *newpage, struct page *oldpage) -+{ -+ struct stable_node *stable_node; -+ -+ VM_BUG_ON(!PageLocked(oldpage)); -+ VM_BUG_ON(!PageLocked(newpage)); -+ VM_BUG_ON(newpage->mapping != oldpage->mapping); -+ -+ stable_node = page_stable_node(newpage); -+ if (stable_node) { -+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); -+ stable_node->kpfn = page_to_pfn(newpage); -+ } -+} -+#endif /* CONFIG_MIGRATION */ -+ -+#ifdef CONFIG_MEMORY_HOTREMOVE -+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, -+ unsigned long end_pfn) -+{ -+ struct rb_node *node; -+ -+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { -+ struct stable_node *stable_node; -+ -+ stable_node = rb_entry(node, struct stable_node, node); -+ if (stable_node->kpfn >= start_pfn && -+ stable_node->kpfn < end_pfn) -+ return stable_node; -+ } -+ return NULL; -+} -+ -+static int uksm_memory_callback(struct notifier_block *self, -+ unsigned long action, void *arg) -+{ -+ struct memory_notify *mn = arg; -+ struct stable_node *stable_node; -+ -+ switch (action) { -+ case MEM_GOING_OFFLINE: -+ /* -+ * Keep it very simple for now: just lock out ksmd and -+ * MADV_UNMERGEABLE while any memory is going offline. -+ * mutex_lock_nested() is necessary because lockdep was alarmed -+ * that here we take uksm_thread_mutex inside notifier chain -+ * mutex, and later take notifier chain mutex inside -+ * uksm_thread_mutex to unlock it. But that's safe because both -+ * are inside mem_hotplug_mutex. -+ */ -+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); -+ break; -+ -+ case MEM_OFFLINE: -+ /* -+ * Most of the work is done by page migration; but there might -+ * be a few stable_nodes left over, still pointing to struct -+ * pages which have been offlined: prune those from the tree. -+ */ -+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, -+ mn->start_pfn + mn->nr_pages)) != NULL) -+ remove_node_from_stable_tree(stable_node, 1, 1); -+ /* fallthrough */ -+ -+ case MEM_CANCEL_OFFLINE: -+ mutex_unlock(&uksm_thread_mutex); -+ break; -+ } -+ return NOTIFY_OK; -+} -+#endif /* CONFIG_MEMORY_HOTREMOVE */ -+ -+#ifdef CONFIG_SYSFS -+/* -+ * This all compiles without CONFIG_SYSFS, but is a waste of space. -+ */ -+ -+#define UKSM_ATTR_RO(_name) \ -+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) -+#define UKSM_ATTR(_name) \ -+ static struct kobj_attribute _name##_attr = \ -+ __ATTR(_name, 0644, _name##_show, _name##_store) -+ -+static ssize_t max_cpu_percentage_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); -+} -+ -+static ssize_t max_cpu_percentage_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned long max_cpu_percentage; -+ int err; -+ -+ err = strict_strtoul(buf, 10, &max_cpu_percentage); -+ if (err || max_cpu_percentage > 100) -+ return -EINVAL; -+ -+ if (max_cpu_percentage == 100) -+ max_cpu_percentage = 99; -+ else if (max_cpu_percentage < 10) -+ max_cpu_percentage = 10; -+ -+ uksm_max_cpu_percentage = max_cpu_percentage; -+ -+ return count; -+} -+UKSM_ATTR(max_cpu_percentage); -+ -+static ssize_t sleep_millisecs_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); -+} -+ -+static ssize_t sleep_millisecs_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned long msecs; -+ int err; -+ -+ err = strict_strtoul(buf, 10, &msecs); -+ if (err || msecs > MSEC_PER_SEC) -+ return -EINVAL; -+ -+ uksm_sleep_jiffies = msecs_to_jiffies(msecs); -+ uksm_sleep_saved = uksm_sleep_jiffies; -+ -+ return count; -+} -+UKSM_ATTR(sleep_millisecs); -+ -+ -+static ssize_t cpu_governor_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); -+ int i; -+ -+ buf[0] = '\0'; -+ for (i = 0; i < n ; i++) { -+ if (uksm_cpu_governor == i) -+ strcat(buf, "["); -+ -+ strcat(buf, uksm_cpu_governor_str[i]); -+ -+ if (uksm_cpu_governor == i) -+ strcat(buf, "]"); -+ -+ strcat(buf, " "); -+ } -+ strcat(buf, "\n"); -+ -+ return strlen(buf); -+} -+ -+static inline void init_performance_values(void) -+{ -+ int i; -+ struct scan_rung *rung; -+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; -+ -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = uksm_scan_ladder + i; -+ rung->cpu_ratio = preset->cpu_ratio[i]; -+ rung->cover_msecs = preset->cover_msecs[i]; -+ } -+ -+ uksm_max_cpu_percentage = preset->max_cpu; -+} -+ -+static ssize_t cpu_governor_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); -+ -+ for (n--; n >=0 ; n--) { -+ if (!strncmp(buf, uksm_cpu_governor_str[n], -+ strlen(uksm_cpu_governor_str[n]))) -+ break; -+ } -+ -+ if (n < 0) -+ return -EINVAL; -+ else -+ uksm_cpu_governor = n; -+ -+ init_performance_values(); -+ -+ return count; -+} -+UKSM_ATTR(cpu_governor); -+ -+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, -+ char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_run); -+} -+ -+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = strict_strtoul(buf, 10, &flags); -+ if (err || flags > UINT_MAX) -+ return -EINVAL; -+ if (flags > UKSM_RUN_MERGE) -+ return -EINVAL; -+ -+ mutex_lock(&uksm_thread_mutex); -+ if (uksm_run != flags) { -+ uksm_run = flags; -+ } -+ mutex_unlock(&uksm_thread_mutex); -+ -+ if (flags & UKSM_RUN_MERGE) -+ wake_up_interruptible(&uksm_thread_wait); -+ -+ return count; -+} -+UKSM_ATTR(run); -+ -+static ssize_t abundant_threshold_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_abundant_threshold); -+} -+ -+static ssize_t abundant_threshold_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = strict_strtoul(buf, 10, &flags); -+ if (err || flags > 99) -+ return -EINVAL; -+ -+ uksm_abundant_threshold = flags; -+ -+ return count; -+} -+UKSM_ATTR(abundant_threshold); -+ -+static ssize_t thrash_threshold_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_thrash_threshold); -+} -+ -+static ssize_t thrash_threshold_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = strict_strtoul(buf, 10, &flags); -+ if (err || flags > 99) -+ return -EINVAL; -+ -+ uksm_thrash_threshold = flags; -+ -+ return count; -+} -+UKSM_ATTR(thrash_threshold); -+ -+static ssize_t cpu_ratios_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int i, size; -+ struct scan_rung *rung; -+ char *p = buf; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ if (rung->cpu_ratio > 0) -+ size = sprintf(p, "%d ", rung->cpu_ratio); -+ else -+ size = sprintf(p, "MAX/%d ", -+ TIME_RATIO_SCALE / -rung->cpu_ratio); -+ -+ p += size; -+ } -+ -+ *p++ = '\n'; -+ *p = '\0'; -+ -+ return p - buf; -+} -+ -+static ssize_t cpu_ratios_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int i, cpuratios[SCAN_LADDER_SIZE], err; -+ unsigned long value; -+ struct scan_rung *rung; -+ char *p, *end = NULL; -+ -+ p = kzalloc(count, GFP_KERNEL); -+ if (!p) -+ return -ENOMEM; -+ -+ memcpy(p, buf, count); -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ if (i != SCAN_LADDER_SIZE -1) { -+ end = strchr(p, ' '); -+ if (!end) -+ return -EINVAL; -+ -+ *end = '\0'; -+ } -+ -+ if (strstr(p, "MAX/")) { -+ p = strchr(p, '/') + 1; -+ err = strict_strtoul(p, 10, &value); -+ if (err || value > TIME_RATIO_SCALE || !value) -+ return -EINVAL; -+ -+ cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); -+ } else { -+ err = strict_strtoul(p, 10, &value); -+ if (err || value > TIME_RATIO_SCALE || !value) -+ return -EINVAL; -+ -+ cpuratios[i] = value; -+ } -+ -+ p = end + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ rung->cpu_ratio = cpuratios[i]; -+ } -+ -+ return count; -+} -+UKSM_ATTR(cpu_ratios); -+ -+static ssize_t eval_intervals_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int i, size; -+ struct scan_rung *rung; -+ char *p = buf; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ size = sprintf(p, "%u ", rung->cover_msecs); -+ p += size; -+ } -+ -+ *p++ = '\n'; -+ *p = '\0'; -+ -+ return p - buf; -+} -+ -+static ssize_t eval_intervals_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int i, err; -+ unsigned long values[SCAN_LADDER_SIZE]; -+ struct scan_rung *rung; -+ char *p, *end = NULL; -+ -+ p = kzalloc(count, GFP_KERNEL); -+ if (!p) -+ return -ENOMEM; -+ -+ memcpy(p, buf, count); -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ if (i != SCAN_LADDER_SIZE -1) { -+ end = strchr(p, ' '); -+ if (!end) -+ return -EINVAL; -+ -+ *end = '\0'; -+ } -+ -+ err = strict_strtoul(p, 10, &values[i]); -+ if (err) -+ return -EINVAL; -+ -+ p = end + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ rung->cover_msecs = values[i]; -+ } -+ -+ return count; -+} -+UKSM_ATTR(eval_intervals); -+ -+static ssize_t ema_per_page_time_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_ema_page_time); -+} -+UKSM_ATTR_RO(ema_per_page_time); -+ -+static ssize_t pages_shared_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_shared); -+} -+UKSM_ATTR_RO(pages_shared); -+ -+static ssize_t pages_sharing_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_sharing); -+} -+UKSM_ATTR_RO(pages_sharing); -+ -+static ssize_t pages_unshared_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_unshared); -+} -+UKSM_ATTR_RO(pages_unshared); -+ -+static ssize_t full_scans_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%llu\n", fully_scanned_round); -+} -+UKSM_ATTR_RO(full_scans); -+ -+static ssize_t pages_scanned_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ unsigned long base = 0; -+ u64 delta, ret; -+ -+ if (pages_scanned_stored) { -+ base = pages_scanned_base; -+ ret = pages_scanned_stored; -+ delta = uksm_pages_scanned >> base; -+ if (CAN_OVERFLOW_U64(ret, delta)) { -+ ret >>= 1; -+ delta >>= 1; -+ base++; -+ ret += delta; -+ } -+ } else { -+ ret = uksm_pages_scanned; -+ } -+ -+ while (ret > ULONG_MAX) { -+ ret >>= 1; -+ base++; -+ } -+ -+ if (base) -+ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); -+ else -+ return sprintf(buf, "%lu\n", (unsigned long)ret); -+} -+UKSM_ATTR_RO(pages_scanned); -+ -+static ssize_t hash_strength_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", hash_strength); -+} -+UKSM_ATTR_RO(hash_strength); -+ -+static ssize_t sleep_times_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%llu\n", uksm_sleep_times); -+} -+UKSM_ATTR_RO(sleep_times); -+ -+ -+static struct attribute *uksm_attrs[] = { -+ &max_cpu_percentage_attr.attr, -+ &sleep_millisecs_attr.attr, -+ &cpu_governor_attr.attr, -+ &run_attr.attr, -+ &ema_per_page_time_attr.attr, -+ &pages_shared_attr.attr, -+ &pages_sharing_attr.attr, -+ &pages_unshared_attr.attr, -+ &full_scans_attr.attr, -+ &pages_scanned_attr.attr, -+ &hash_strength_attr.attr, -+ &sleep_times_attr.attr, -+ &thrash_threshold_attr.attr, -+ &abundant_threshold_attr.attr, -+ &cpu_ratios_attr.attr, -+ &eval_intervals_attr.attr, -+ NULL, -+}; -+ -+static struct attribute_group uksm_attr_group = { -+ .attrs = uksm_attrs, -+ .name = "uksm", -+}; -+#endif /* CONFIG_SYSFS */ -+ -+static inline void init_scan_ladder(void) -+{ -+ int i; -+ struct scan_rung *rung; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = uksm_scan_ladder + i; -+ slot_tree_init_root(&rung->vma_root); -+ } -+ -+ init_performance_values(); -+ uksm_calc_scan_pages(); -+} -+ -+static inline int cal_positive_negative_costs(void) -+{ -+ struct page *p1, *p2; -+ unsigned char *addr1, *addr2; -+ unsigned long i, time_start, hash_cost; -+ unsigned long loopnum = 0; -+ -+ /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */ -+ volatile u32 hash; -+ volatile int ret; -+ -+ p1 = alloc_page(GFP_KERNEL); -+ if (!p1) -+ return -ENOMEM; -+ -+ p2 = alloc_page(GFP_KERNEL); -+ if (!p2) -+ return -ENOMEM; -+ -+ addr1 = kmap_atomic(p1); -+ addr2 = kmap_atomic(p2); -+ memset(addr1, prandom_u32(), PAGE_SIZE); -+ memcpy(addr2, addr1, PAGE_SIZE); -+ -+ /* make sure that the two pages differ in last byte */ -+ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; -+ kunmap_atomic(addr2); -+ kunmap_atomic(addr1); -+ -+ time_start = jiffies; -+ while (jiffies - time_start < 100) { -+ for (i = 0; i < 100; i++) -+ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); -+ loopnum += 100; -+ } -+ hash_cost = (jiffies - time_start); -+ -+ time_start = jiffies; -+ for (i = 0; i < loopnum; i++) -+ ret = pages_identical(p1, p2); -+ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); -+ memcmp_cost /= hash_cost; -+ printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " -+ "hash=%u cmp_ret=%d.\n", -+ memcmp_cost, hash, ret); -+ -+ __free_page(p1); -+ __free_page(p2); -+ return 0; -+} -+ -+static int init_zeropage_hash_table(void) -+{ -+ struct page *page; -+ char *addr; -+ int i; -+ -+ page = alloc_page(GFP_KERNEL); -+ if (!page) -+ return -ENOMEM; -+ -+ addr = kmap_atomic(page); -+ memset(addr, 0, PAGE_SIZE); -+ kunmap_atomic(addr); -+ -+ zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), -+ GFP_KERNEL); -+ if (!zero_hash_table) -+ return -ENOMEM; -+ -+ for (i = 0; i < HASH_STRENGTH_MAX; i++) -+ zero_hash_table[i] = page_hash(page, i, 0); -+ -+ __free_page(page); -+ -+ return 0; -+} -+ -+static inline int init_random_sampling(void) -+{ -+ unsigned long i; -+ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); -+ if (!random_nums) -+ return -ENOMEM; -+ -+ for (i = 0; i < HASH_STRENGTH_FULL; i++) -+ random_nums[i] = i; -+ -+ for (i = 0; i < HASH_STRENGTH_FULL; i++) { -+ unsigned long rand_range, swap_index, tmp; -+ -+ rand_range = HASH_STRENGTH_FULL - i; -+ swap_index = i + prandom_u32() % rand_range; -+ tmp = random_nums[i]; -+ random_nums[i] = random_nums[swap_index]; -+ random_nums[swap_index] = tmp; -+ } -+ -+ rshash_state.state = RSHASH_NEW; -+ rshash_state.below_count = 0; -+ rshash_state.lookup_window_index = 0; -+ -+ return cal_positive_negative_costs(); -+} -+ -+static int __init uksm_slab_init(void) -+{ -+ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); -+ if (!rmap_item_cache) -+ goto out; -+ -+ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); -+ if (!stable_node_cache) -+ goto out_free1; -+ -+ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); -+ if (!node_vma_cache) -+ goto out_free2; -+ -+ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); -+ if (!vma_slot_cache) -+ goto out_free3; -+ -+ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); -+ if (!tree_node_cache) -+ goto out_free4; -+ -+ return 0; -+ -+out_free4: -+ kmem_cache_destroy(vma_slot_cache); -+out_free3: -+ kmem_cache_destroy(node_vma_cache); -+out_free2: -+ kmem_cache_destroy(stable_node_cache); -+out_free1: -+ kmem_cache_destroy(rmap_item_cache); -+out: -+ return -ENOMEM; -+} -+ -+static void __init uksm_slab_free(void) -+{ -+ kmem_cache_destroy(stable_node_cache); -+ kmem_cache_destroy(rmap_item_cache); -+ kmem_cache_destroy(node_vma_cache); -+ kmem_cache_destroy(vma_slot_cache); -+ kmem_cache_destroy(tree_node_cache); -+} -+ -+/* Common interface to ksm, different to it. */ -+int ksm_madvise(struct vm_area_struct *vma, unsigned long start, -+ unsigned long end, int advice, unsigned long *vm_flags) -+{ -+ int err; -+ -+ switch (advice) { -+ case MADV_MERGEABLE: -+ return 0; /* just ignore the advice */ -+ -+ case MADV_UNMERGEABLE: -+ if (!(*vm_flags & VM_MERGEABLE)) -+ return 0; /* just ignore the advice */ -+ -+ if (vma->anon_vma) { -+ err = unmerge_uksm_pages(vma, start, end); -+ if (err) -+ return err; -+ } -+ -+ uksm_remove_vma(vma); -+ *vm_flags &= ~VM_MERGEABLE; -+ break; -+ } -+ -+ return 0; -+} -+ -+/* Common interface to ksm, actually the same. */ -+struct page *ksm_might_need_to_copy(struct page *page, -+ struct vm_area_struct *vma, unsigned long address) -+{ -+ struct anon_vma *anon_vma = page_anon_vma(page); -+ struct page *new_page; -+ -+ if (PageKsm(page)) { -+ if (page_stable_node(page)) -+ return page; /* no need to copy it */ -+ } else if (!anon_vma) { -+ return page; /* no need to copy it */ -+ } else if (anon_vma->root == vma->anon_vma->root && -+ page->index == linear_page_index(vma, address)) { -+ return page; /* still no need to copy it */ -+ } -+ if (!PageUptodate(page)) -+ return page; /* let do_swap_page report the error */ -+ -+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); -+ if (new_page) { -+ copy_user_highpage(new_page, page, address, vma); -+ -+ SetPageDirty(new_page); -+ __SetPageUptodate(new_page); -+ __set_page_locked(new_page); -+ } -+ -+ return new_page; -+} -+ -+static int __init uksm_init(void) -+{ -+ struct task_struct *uksm_thread; -+ int err; -+ -+ uksm_sleep_jiffies = msecs_to_jiffies(100); -+ uksm_sleep_saved = uksm_sleep_jiffies; -+ -+ slot_tree_init(); -+ init_scan_ladder(); -+ -+ -+ err = init_random_sampling(); -+ if (err) -+ goto out_free2; -+ -+ err = uksm_slab_init(); -+ if (err) -+ goto out_free1; -+ -+ err = init_zeropage_hash_table(); -+ if (err) -+ goto out_free0; -+ -+ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); -+ if (IS_ERR(uksm_thread)) { -+ printk(KERN_ERR "uksm: creating kthread failed\n"); -+ err = PTR_ERR(uksm_thread); -+ goto out_free; -+ } -+ -+#ifdef CONFIG_SYSFS -+ err = sysfs_create_group(mm_kobj, &uksm_attr_group); -+ if (err) { -+ printk(KERN_ERR "uksm: register sysfs failed\n"); -+ kthread_stop(uksm_thread); -+ goto out_free; -+ } -+#else -+ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ -+ -+#endif /* CONFIG_SYSFS */ -+ -+#ifdef CONFIG_MEMORY_HOTREMOVE -+ /* -+ * Choose a high priority since the callback takes uksm_thread_mutex: -+ * later callbacks could only be taking locks which nest within that. -+ */ -+ hotplug_memory_notifier(uksm_memory_callback, 100); -+#endif -+ return 0; -+ -+out_free: -+ kfree(zero_hash_table); -+out_free0: -+ uksm_slab_free(); -+out_free1: -+ kfree(random_nums); -+out_free2: -+ kfree(uksm_scan_ladder); -+ return err; -+} -+ -+#ifdef MODULE -+module_init(uksm_init) -+#else -+late_initcall(uksm_init); -+#endif -+ -diff --git a/mm/vmstat.c b/mm/vmstat.c -index 7249614..3cbe5ba 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -772,6 +772,9 @@ const char * const vmstat_text[] = { - #endif - "nr_anon_transparent_hugepages", - "nr_free_cma", -+#ifdef CONFIG_UKSM -+ "nr_uksm_zero_pages", -+#endif - "nr_dirty_threshold", - "nr_dirty_background_threshold", -